Connection to the drive

# Attach Google Drive to the Colab runtime; the dataset loaded further down
# is read from a path underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
!pip install emoji 

Requirement already satisfied: emoji in /usr/local/lib/python3.7/dist-packages (1.4.2)
!pip install emojis

Requirement already satisfied: emojis in /usr/local/lib/python3.7/dist-packages (0.6.0)

#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')

# Importing required libraries and packages
import emoji
import emojis
import pandas as pd
pd.options.display.max_columns = 110  # None -> No Restrictions
pd.options.display.max_rows = 200    # None -> Be careful with this 
pd.options.display.max_colwidth = 60
pd.options.display.precision = 1
pd.options.display.max_info_columns = 200
import numpy as np
from numpy import unique
from numpy import where
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import scipy.cluster.hierarchy as sch
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering

# Visual confirmation that the imports above completed: the message is framed
# by two identical rows of 28 laptop emojis.
laptop_row = emoji.emojize(":laptop:") * 28
print(
    laptop_row,
    "\n\nAll the required libraries and packages are imported successfully !!!\n\n",
    laptop_row,
)
💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻 

All the required libraries and packages are imported successfully !!!

 💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻💻

# Read players_20.csv from the mounted Drive into the `data` DataFrame that
# the rest of the notebook works on.
csv_path = '/content/drive/MyDrive/ColabNotebooks/1004_FIFA/players_20.csv'
data = pd.read_csv(csv_path)

# Banner: the success message framed by two rows of 12 folder emojis.
folder_row = emoji.emojize(":file_folder:") * 12
print(folder_row, "\n\nData loaded successfully !!!\n\n", folder_row)
📁📁📁📁📁📁📁📁📁📁📁📁 

Data loaded successfully !!!

 📁📁📁📁📁📁📁📁📁📁📁📁

# To have a glimpse of the data: render the first 10 rows as a styled table.
down_arrows = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print("\nGlimpse of data : ", down_arrows)
print("")

# CSS rules for the rendered HTML table: coloured header, zebra-striped rows,
# and enlarged text for whichever header/cell the mouse hovers over.
glimpse_styles = [
    dict(selector='th',
         props=[('background', '#A60B2E'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# axis=0 -> the max / min highlight is computed column by column.
# This expression is the last statement of the cell so the notebook renders it.
(
    data.head(10)
    .style.set_table_styles(glimpse_styles)
    .highlight_max(color='#0074FF', axis=0)
    .highlight_min(color='#00FFE5', axis=0)
    # '#CCB3C5' replaces the five-digit '#CCB3C', which is not a valid CSS hex
    # colour (browsers typically ignore such a declaration, leaving null
    # cells unshaded). The colour is passed positionally because the keyword
    # was renamed from `null_color` to `color` in newer pandas releases.
    .highlight_null('#CCB3C5')
)
Glimpse of data :  👇🏻👇🏻👇🏻

sofifa_id player_url short_name long_name age dob height_cm weight_kg nationality club overall potential value_eur wage_eur player_positions preferred_foot international_reputation weak_foot skill_moves work_rate body_type real_face release_clause_eur player_tags team_position team_jersey_number loaned_from joined contract_valid_until nation_position nation_jersey_number pace shooting passing dribbling defending physic gk_diving gk_handling gk_kicking gk_reflexes gk_speed gk_positioning player_traits attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes ls st rs lw lf cf rf rw lam cam ram lm lcm cm rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb
0 158023 https://sofifa.com/player/158023/lionel-messi/20/159586 L. Messi Lionel Andrés Messi Cuccittini 32 1987-06-24 170 72 Argentina FC Barcelona 94 94 95500000 565000 RW, CF, ST Left 5 4 4 Medium/Low Messi Yes 195800000.0 #Dribbler, #Distance Shooter, #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward RW 10.0 nan 2004-07-01 2021.0 nan nan 87.0 92.0 92.0 96.0 39.0 66.0 nan nan nan nan nan nan Beat Offside Trap, Argues with Officials, Early Crosser, Finesse Shot, Speed Dribbler (CPU AI Only), 1-on-1 Rush, Giant Throw-in, Outside Foot Shot 88 95 70 92 88 97 93 94 92 96 91 84 93 95 95 86 68 75 68 94 48 40 94 94 75 96 33 37 26 6 11 15 14 8 89+2 89+2 89+2 93+2 93+2 93+2 93+2 93+2 93+2 93+2 93+2 92+2 87+2 87+2 87+2 92+2 68+2 66+2 66+2 66+2 68+2 63+2 52+2 52+2 52+2 63+2
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-santos-aveiro/20/159586 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 34 1985-02-05 187 83 Portugal Juventus 93 93 58500000 405000 ST, LW Right 5 4 5 High/Low C. Ronaldo Yes 96500000.0 #Speedster, #Dribbler, #Distance Shooter, #Acrobat, #Clinical Finisher, #Complete Forward LW 7.0 nan 2018-07-10 2022.0 LS 7.0 90.0 93.0 82.0 89.0 35.0 78.0 nan nan nan nan nan nan Long Throw-in, Selfish, Argues with Officials, Early Crosser, Speed Dribbler (CPU AI Only), Skilled Dribbling 84 94 89 83 87 89 81 76 77 92 89 91 87 96 71 95 95 85 78 93 63 29 95 82 85 95 28 32 24 7 11 15 14 11 91+3 91+3 91+3 89+3 90+3 90+3 90+3 89+3 88+3 88+3 88+3 88+3 81+3 81+3 81+3 88+3 65+3 61+3 61+3 61+3 65+3 61+3 53+3 53+3 53+3 61+3
2 190871 https://sofifa.com/player/190871/neymar-da-silva-santos-jr/20/159586 Neymar Jr Neymar da Silva Santos Junior 27 1992-02-05 175 68 Brazil Paris Saint-Germain 92 92 105500000 290000 LW, CAM Right 5 5 5 High/Medium Neymar Yes 195200000.0 #Speedster, #Dribbler, #Playmaker  , #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Midfielder, #Complete Forward CAM 10.0 nan 2017-08-03 2022.0 LW 10.0 91.0 85.0 87.0 95.0 32.0 58.0 nan nan nan nan nan nan Power Free-Kick, Injury Free, Selfish, Early Crosser, Speed Dribbler (CPU AI Only), Crowd Favourite 87 87 62 87 87 96 88 87 81 95 94 89 96 92 84 80 61 81 49 84 51 36 87 90 90 94 27 26 29 9 9 15 15 11 84+3 84+3 84+3 90+3 89+3 89+3 89+3 90+3 90+3 90+3 90+3 89+3 82+3 82+3 82+3 89+3 66+3 61+3 61+3 61+3 66+3 61+3 46+3 46+3 46+3 61+3
3 200389 https://sofifa.com/player/200389/jan-oblak/20/159586 J. Oblak Jan Oblak 26 1993-01-07 188 87 Slovenia Atlético Madrid 91 93 77500000 125000 GK Right 3 3 1 Medium/Medium Normal Yes 164700000.0 nan GK 13.0 nan 2014-07-16 2023.0 GK 1.0 nan nan nan nan nan nan 87.0 92.0 78.0 89.0 52.0 90.0 Flair, Acrobatic Clearance 13 11 15 43 13 12 13 14 40 30 43 60 67 88 49 59 78 41 78 12 34 19 11 65 11 68 27 12 18 87 92 78 90 89 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
4 183277 https://sofifa.com/player/183277/eden-hazard/20/159586 E. Hazard Eden Hazard 28 1991-01-07 175 74 Belgium Real Madrid 91 91 90000000 470000 LW, CF Right 4 4 4 High/Medium Normal Yes 184500000.0 #Speedster, #Dribbler, #Acrobat LW 7.0 nan 2019-07-01 2024.0 LF 10.0 91.0 83.0 86.0 94.0 35.0 66.0 nan nan nan nan nan nan Beat Offside Trap, Selfish, Finesse Shot, Speed Dribbler (CPU AI Only), Crowd Favourite 81 84 61 89 83 95 83 79 83 94 94 88 95 90 94 82 56 84 63 80 54 41 87 89 88 91 34 27 22 11 12 6 8 8 83+3 83+3 83+3 89+3 88+3 88+3 88+3 89+3 89+3 89+3 89+3 89+3 83+3 83+3 83+3 89+3 66+3 63+3 63+3 63+3 66+3 61+3 49+3 49+3 49+3 61+3
5 192985 https://sofifa.com/player/192985/kevin-de-bruyne/20/159586 K. De Bruyne Kevin De Bruyne 28 1991-06-28 181 70 Belgium Manchester City 91 91 90000000 370000 CAM, CM Right 4 5 4 High/High Normal Yes 166500000.0 #Dribbler, #Playmaker  , #Engine, #Distance Shooter, #Crosser, #Complete Midfielder RCM 17.0 nan 2015-08-30 2023.0 RCM 7.0 76.0 86.0 92.0 86.0 61.0 78.0 nan nan nan nan nan nan Power Free-Kick, Avoids Using Weaker Foot, Dives Into Tackles (CPU AI Only), Leadership, Argues with Officials, Finesse Shot 93 82 55 92 82 86 85 83 91 91 77 76 78 91 76 91 63 89 74 90 76 61 88 94 79 91 68 58 51 15 13 5 10 13 82+3 82+3 82+3 87+3 87+3 87+3 87+3 87+3 88+3 88+3 88+3 88+3 87+3 87+3 87+3 88+3 77+3 77+3 77+3 77+3 77+3 73+3 66+3 66+3 66+3 73+3
6 192448 https://sofifa.com/player/192448/marc-andre-ter-stegen/20/159586 M. ter Stegen Marc-André ter Stegen 27 1992-04-30 187 85 Germany FC Barcelona 90 93 67500000 250000 GK Right 3 4 1 Medium/Medium Normal Yes 143400000.0 nan GK 1.0 nan 2014-07-01 2022.0 SUB 22.0 nan nan nan nan nan nan 88.0 85.0 88.0 90.0 45.0 88.0 Swerve Pass, Acrobatic Clearance, Flair Passes 18 14 11 61 14 21 18 12 63 30 38 50 37 86 43 66 79 35 78 10 43 22 11 70 25 70 25 13 10 88 85 88 88 90 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
7 203376 https://sofifa.com/player/203376/virgil-van-dijk/20/159586 V. van Dijk Virgil van Dijk 27 1991-07-08 193 92 Netherlands Liverpool 90 91 78000000 200000 CB Right 3 3 2 Medium/Medium Normal Yes 150200000.0 #Tackling , #Tactician , #Strength, #Complete Defender LCB 4.0 nan 2018-01-01 2023.0 LCB 4.0 77.0 60.0 70.0 71.0 90.0 86.0 nan nan nan nan nan nan Diver, Avoids Using Weaker Foot, Leadership, Long Passer (CPU AI Only) 53 52 86 78 45 70 60 70 81 76 74 79 61 88 53 81 90 75 92 64 82 89 47 65 62 89 91 92 85 13 10 13 11 11 69+3 69+3 69+3 67+3 69+3 69+3 69+3 67+3 69+3 69+3 69+3 69+3 74+3 74+3 74+3 69+3 79+3 83+3 83+3 83+3 79+3 81+3 87+3 87+3 87+3 81+3
8 177003 https://sofifa.com/player/177003/luka-modric/20/159586 L. Modrić Luka Modrić 33 1985-09-09 172 66 Croatia Real Madrid 90 90 45000000 340000 CM Right 4 4 4 High/High Lean Yes 92300000.0 #Dribbler, #Playmaker  , #Crosser, #Acrobat, #Complete Midfielder RCM 10.0 nan 2012-08-01 2020.0 nan nan 74.0 76.0 89.0 89.0 72.0 66.0 nan nan nan nan nan nan Argues with Officials, Finesse Shot, Speed Dribbler (CPU AI Only), Crowd Favourite 86 72 55 92 76 87 85 78 88 92 77 71 92 89 93 79 68 85 58 82 62 82 79 91 82 92 68 76 71 13 9 7 14 9 77+3 77+3 77+3 84+3 83+3 83+3 83+3 84+3 86+3 86+3 86+3 85+3 87+3 87+3 87+3 85+3 81+3 81+3 81+3 81+3 81+3 79+3 72+3 72+3 72+3 79+3
9 209331 https://sofifa.com/player/209331/mohamed-salah/20/159586 M. Salah Mohamed Salah Ghaly 27 1992-06-15 175 71 Egypt Liverpool 90 90 80500000 240000 RW, ST Left 3 3 4 High/Medium PLAYER_BODY_TYPE_25 Yes 148900000.0 #Speedster, #Dribbler, #Acrobat, #Clinical Finisher, #Complete Forward RW 11.0 nan 2017-07-01 2023.0 RW 10.0 93.0 86.0 81.0 89.0 45.0 74.0 nan nan nan nan nan nan Beat Offside Trap, Argues with Officials, Early Crosser, Speed Dribbler (CPU AI Only), Outside Foot Shot 79 90 59 84 79 89 83 69 75 89 94 92 91 92 88 80 69 85 73 84 63 55 92 84 77 91 38 43 41 14 14 9 11 14 84+3 84+3 84+3 88+3 88+3 88+3 88+3 88+3 87+3 87+3 87+3 87+3 81+3 81+3 81+3 87+3 70+3 67+3 67+3 67+3 70+3 66+3 57+3 57+3 57+3 66+3

Basic Info of the dataset

# Report the dataset dimensions: one row per player, one column per feature.
print("\nFinding the no. of rows and cols in the dataset : \n\n",
      emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3)
print("")

n_players, n_features = data.shape
print("No. of Players : {}".format(n_players))
print("No. of features : {} ".format(n_features))
Finding the no. of rows and cols in the dataset : 

 👇🏻👇🏻👇🏻

No. of Players : 18278
No. of features : 104 

# Overview of shape, attributes, types and missing values
down_arrows = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print("\nOverview of shape, attributes, types and missing values : \n\n",
      "\t\t", down_arrows)
print("")

# verbose=True forces the full per-column listing.
# NOTE: DataFrame.info() prints its report and returns None, so `overview`
# ends up holding None rather than the summary text.
overview = data.info(verbose=True)
Overview of shape, attributes, types and missing values : 

 		 👇🏻👇🏻👇🏻

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18278 entries, 0 to 18277
Data columns (total 104 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   sofifa_id                   18278 non-null  int64  
 1   player_url                  18278 non-null  object 
 2   short_name                  18278 non-null  object 
 3   long_name                   18278 non-null  object 
 4   age                         18278 non-null  int64  
 5   dob                         18278 non-null  object 
 6   height_cm                   18278 non-null  int64  
 7   weight_kg                   18278 non-null  int64  
 8   nationality                 18278 non-null  object 
 9   club                        18278 non-null  object 
 10  overall                     18278 non-null  int64  
 11  potential                   18278 non-null  int64  
 12  value_eur                   18278 non-null  int64  
 13  wage_eur                    18278 non-null  int64  
 14  player_positions            18278 non-null  object 
 15  preferred_foot              18278 non-null  object 
 16  international_reputation    18278 non-null  int64  
 17  weak_foot                   18278 non-null  int64  
 18  skill_moves                 18278 non-null  int64  
 19  work_rate                   18278 non-null  object 
 20  body_type                   18278 non-null  object 
 21  real_face                   18278 non-null  object 
 22  release_clause_eur          16980 non-null  float64
 23  player_tags                 1499 non-null   object 
 24  team_position               18038 non-null  object 
 25  team_jersey_number          18038 non-null  float64
 26  loaned_from                 1048 non-null   object 
 27  joined                      16990 non-null  object 
 28  contract_valid_until        18038 non-null  float64
 29  nation_position             1126 non-null   object 
 30  nation_jersey_number        1126 non-null   float64
 31  pace                        16242 non-null  float64
 32  shooting                    16242 non-null  float64
 33  passing                     16242 non-null  float64
 34  dribbling                   16242 non-null  float64
 35  defending                   16242 non-null  float64
 36  physic                      16242 non-null  float64
 37  gk_diving                   2036 non-null   float64
 38  gk_handling                 2036 non-null   float64
 39  gk_kicking                  2036 non-null   float64
 40  gk_reflexes                 2036 non-null   float64
 41  gk_speed                    2036 non-null   float64
 42  gk_positioning              2036 non-null   float64
 43  player_traits               7566 non-null   object 
 44  attacking_crossing          18278 non-null  int64  
 45  attacking_finishing         18278 non-null  int64  
 46  attacking_heading_accuracy  18278 non-null  int64  
 47  attacking_short_passing     18278 non-null  int64  
 48  attacking_volleys           18278 non-null  int64  
 49  skill_dribbling             18278 non-null  int64  
 50  skill_curve                 18278 non-null  int64  
 51  skill_fk_accuracy           18278 non-null  int64  
 52  skill_long_passing          18278 non-null  int64  
 53  skill_ball_control          18278 non-null  int64  
 54  movement_acceleration       18278 non-null  int64  
 55  movement_sprint_speed       18278 non-null  int64  
 56  movement_agility            18278 non-null  int64  
 57  movement_reactions          18278 non-null  int64  
 58  movement_balance            18278 non-null  int64  
 59  power_shot_power            18278 non-null  int64  
 60  power_jumping               18278 non-null  int64  
 61  power_stamina               18278 non-null  int64  
 62  power_strength              18278 non-null  int64  
 63  power_long_shots            18278 non-null  int64  
 64  mentality_aggression        18278 non-null  int64  
 65  mentality_interceptions     18278 non-null  int64  
 66  mentality_positioning       18278 non-null  int64  
 67  mentality_vision            18278 non-null  int64  
 68  mentality_penalties         18278 non-null  int64  
 69  mentality_composure         18278 non-null  int64  
 70  defending_marking           18278 non-null  int64  
 71  defending_standing_tackle   18278 non-null  int64  
 72  defending_sliding_tackle    18278 non-null  int64  
 73  goalkeeping_diving          18278 non-null  int64  
 74  goalkeeping_handling        18278 non-null  int64  
 75  goalkeeping_kicking         18278 non-null  int64  
 76  goalkeeping_positioning     18278 non-null  int64  
 77  goalkeeping_reflexes        18278 non-null  int64  
 78  ls                          16242 non-null  object 
 79  st                          16242 non-null  object 
 80  rs                          16242 non-null  object 
 81  lw                          16242 non-null  object 
 82  lf                          16242 non-null  object 
 83  cf                          16242 non-null  object 
 84  rf                          16242 non-null  object 
 85  rw                          16242 non-null  object 
 86  lam                         16242 non-null  object 
 87  cam                         16242 non-null  object 
 88  ram                         16242 non-null  object 
 89  lm                          16242 non-null  object 
 90  lcm                         16242 non-null  object 
 91  cm                          16242 non-null  object 
 92  rcm                         16242 non-null  object 
 93  rm                          16242 non-null  object 
 94  lwb                         16242 non-null  object 
 95  ldm                         16242 non-null  object 
 96  cdm                         16242 non-null  object 
 97  rdm                         16242 non-null  object 
 98  rwb                         16242 non-null  object 
 99  lb                          16242 non-null  object 
 100 lcb                         16242 non-null  object 
 101 cb                          16242 non-null  object 
 102 rcb                         16242 non-null  object 
 103 rb                          16242 non-null  object 
dtypes: float64(16), int64(45), object(43)
memory usage: 14.5+ MB

📝📝 There are 45 int variables, 16 float variables and 43 object variables in the dataset.

# General stats of the numerical variables, rendered as a styled table.
down_arrows = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print("\nGeneral stats of the numerical variables : ", down_arrows)
print("")

# CSS rules for the rendered HTML table: coloured header, zebra-striped rows,
# and enlarged text for whichever header/cell the mouse hovers over.
numeric_table_styles = [
    dict(selector='th',
         props=[('background', '#753976'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# Last expression of the cell, so the notebook renders the styled summary.
data.describe().style.set_table_styles(numeric_table_styles)
General stats of the numerical variables :  👇🏻👇🏻👇🏻

sofifa_id age height_cm weight_kg overall potential value_eur wage_eur international_reputation weak_foot skill_moves release_clause_eur team_jersey_number contract_valid_until nation_jersey_number pace shooting passing dribbling defending physic gk_diving gk_handling gk_kicking gk_reflexes gk_speed gk_positioning attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes
count 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 16980.0 18038.0 18038.0 1126.0 16242.0 16242.0 16242.0 16242.0 16242.0 16242.0 2036.0 2036.0 2036.0 2036.0 2036.0 2036.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0 18278.0
mean 219738.9 25.3 181.4 75.3 66.2 71.5 2484037.6 9456.9 1.1 2.9 2.4 4740717.4 20.1 2021.1 12.1 67.7 52.3 57.2 62.5 51.6 64.9 65.4 63.1 61.8 66.4 37.8 63.4 49.7 45.6 52.2 58.7 42.8 55.6 47.3 42.7 52.8 58.5 64.3 64.4 63.5 61.8 63.9 58.2 64.9 62.9 65.2 46.8 55.7 46.4 50.1 53.6 48.4 58.5 46.8 47.6 45.6 16.6 16.4 16.2 16.4 16.7
std 27960.2 4.7 6.8 7.0 6.9 6.1 5585481.1 21351.7 0.4 0.7 0.8 11030016.3 16.6 1.3 6.7 11.3 14.0 10.4 10.3 16.4 9.8 7.7 7.2 7.5 8.2 10.6 8.4 18.3 19.6 17.4 14.7 17.7 18.9 18.4 17.4 15.2 16.7 15.0 14.8 14.8 9.1 14.2 13.3 11.9 16.0 12.5 19.3 17.3 20.8 19.6 14.0 15.7 11.9 20.1 21.6 21.2 17.7 17.0 16.6 17.1 18.0
min 768.0 16.0 156.0 50.0 48.0 49.0 0.0 0.0 1.0 1.0 1.0 13000.0 1.0 2019.0 1.0 24.0 15.0 24.0 23.0 15.0 27.0 44.0 42.0 35.0 45.0 12.0 41.0 5.0 2.0 5.0 7.0 3.0 4.0 6.0 4.0 8.0 5.0 12.0 11.0 11.0 21.0 12.0 14.0 19.0 12.0 20.0 4.0 9.0 3.0 2.0 9.0 7.0 12.0 1.0 5.0 3.0 1.0 1.0 1.0 1.0 1.0
25% 204445.5 22.0 177.0 70.0 62.0 67.0 325000.0 1000.0 1.0 3.0 2.0 563000.0 9.0 2020.0 6.0 61.0 42.0 50.0 57.0 36.0 59.0 60.0 58.0 57.0 60.8 29.0 58.0 38.0 30.0 44.0 54.0 30.0 50.0 34.0 31.0 43.0 54.0 56.0 57.0 55.0 56.0 56.0 48.0 58.0 56.0 58.0 32.0 44.0 25.0 39.0 44.0 39.0 51.0 29.0 27.0 24.0 8.0 8.0 8.0 8.0 8.0
50% 226165.0 25.0 181.0 75.0 66.0 71.0 700000.0 3000.0 1.0 3.0 2.0 1200000.0 17.0 2021.0 12.0 69.0 54.0 58.0 64.0 56.0 66.0 65.0 63.0 61.0 66.0 39.0 64.0 54.0 49.0 56.0 62.0 44.0 61.0 49.0 41.0 56.0 63.0 67.0 67.0 66.0 62.0 66.0 59.0 66.0 66.0 66.0 51.0 58.0 52.0 55.0 55.0 49.0 60.0 52.0 55.0 52.0 11.0 11.0 11.0 11.0 11.0
75% 240795.8 29.0 186.0 80.0 71.0 75.0 2100000.0 8000.0 1.0 3.0 3.0 3700000.0 27.0 2022.0 18.0 75.0 63.0 64.0 69.0 65.0 72.0 70.0 68.0 66.0 72.0 46.0 69.0 64.0 62.0 64.0 68.0 56.0 68.0 62.0 56.0 64.0 69.0 75.0 75.0 74.0 68.0 74.0 68.0 73.0 74.0 74.0 62.0 69.0 64.0 64.0 64.0 60.0 67.0 64.0 66.0 64.0 14.0 14.0 14.0 14.0 14.0
max 252905.0 42.0 205.0 110.0 94.0 95.0 105500000.0 565000.0 5.0 5.0 5.0 195800000.0 99.0 2026.0 30.0 96.0 93.0 92.0 96.0 90.0 90.0 90.0 92.0 93.0 92.0 65.0 91.0 93.0 95.0 93.0 92.0 90.0 97.0 94.0 94.0 92.0 96.0 97.0 96.0 96.0 96.0 97.0 95.0 95.0 97.0 97.0 94.0 95.0 92.0 95.0 94.0 92.0 96.0 94.0 92.0 90.0 90.0 92.0 93.0 91.0 92.0

📝📝 Essence of above dataframe :
✏️ The min. age of the players is 16, avg. age is 25 and the max. age is 42 years.
✏️ The min. height of the players is 156cm, avg. height is 181cm and the max. height is 205cm.
✏️ The min. weight of the players is 50 kg, avg. weight is 75 kg and the max. weight is 110 kg.
✏️ The min. overall score of the players is 48, avg. overall score is 66 and the max. overall score is 94 out of 100.
✏️ The min. potential score of the players is 49, avg. potential score is 71.5 and the max. potential score is 95 out of 100.
✏️ The avg. valuation of the players is around 2.5 million Euros and the max. valuation is 105 million Euros.
✏️ The avg. wage of the players is around 9.5k Euros and the max. wage is around 0.56 million Euros.

# General stats of the categorical (object-dtype) variables, rendered as a
# styled table.
down_arrows = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print("\nGeneral stats of the categorical variables : ", down_arrows)
print("")

# CSS rules for the rendered HTML table: coloured header, zebra-striped rows,
# and enlarged text for whichever header/cell the mouse hovers over.
categorical_table_styles = [
    dict(selector='th',
         props=[('background', '#117A65'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# include=['object'] restricts the summary to the object-dtype columns.
# Last expression of the cell, so the notebook renders the styled summary.
data.describe(include=['object']).style.set_table_styles(categorical_table_styles)
General stats of the categorical variables :  👇🏻👇🏻👇🏻

player_url short_name long_name dob nationality club player_positions preferred_foot work_rate body_type real_face player_tags team_position loaned_from joined nation_position player_traits ls st rs lw lf cf rf rw lam cam ram lm lcm cm rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb
count 18278 18278 18278 18278 18278 18278 18278 18278 18278 18278 18278 1499 18038 1048 16990 1126 7566 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242 16242
unique 18278 17354 18218 6142 162 698 643 2 9 10 2 83 29 316 1760 26 921 94 94 94 107 103 103 103 107 103 103 103 100 88 88 88 100 98 98 98 98 98 96 110 110 110 96
top https://sofifa.com/player/208507/daniel-barrio-alvarez/20/159586 J. Rodríguez Liam Kelly 1992-02-29 England Manchester United CB Right Medium/Medium Normal No #Strength SUB Sassuolo 2019-07-01 SUB Early Crosser 61+2 61+2 61+2 63+2 63+2 63+2 63+2 63+2 61+2 61+2 61+2 61+2 58+2 58+2 58+2 61+2 59+2 59+2 59+2 59+2 59+2 61+2 63+2 63+2 63+2 61+2
freq 1 11 3 113 1667 33 2322 13960 9875 10750 16310 514 7820 17 1465 587 501 725 725 725 736 727 727 727 736 748 748 748 809 775 775 775 809 682 636 636 636 682 667 621 621 621 667

📝📝 Essence of above dataframe :
✏️ Players from 162 different countries appear in FIFA 2020, and most players are from England.
✏️ There are 698 different clubs, and the most frequent club shown above is Manchester United (33 players).
✏️ The most preferred position of the players was CB.
✏️ The most preferred foot by the players was Right.
✏️ The work rate of most of the players is Medium/Medium.
✏️ The body type of most of the players is Normal.

Data Preparation

1. Finding duplicate values

# Check whether the dataframe contains any duplicated rows; the boolean
# answer is printed between two rows of check-mark emojis.
question_marks = emoji.emojize(":red_question_mark:") * 2
check_marks = emoji.emojize(":check_mark_button:") * 3
has_duplicates = data.duplicated().any()

print("Duplicate Data", question_marks, "\n")
print(check_marks, "\n\n", has_duplicates, "\n\n", check_marks)
Duplicate Data ❓❓ 

✅✅✅ 

 False 

 ✅✅✅

📝📝 There are no duplicate values in the dataset.

2. Finding Missing Values

# Check whether the dataframe contains at least one missing value.
print("Missing values"
      ,emoji.emojize(":red_question_mark:")*2,"\n")
# The emoji alias must be closed by exactly one colon: with a trailing "::"
# the extra ":" is printed literally after every emoji in the banner.
print(emoji.emojize(":cross_mark_button:")*3
,"\n\n",data.isnull().values.any()
,"\n\n",emoji.emojize(":cross_mark_button:")*3)
Missing values ❓❓ 

❎:❎:❎: 

 True 

 ❎❎❎

# Total number of missing cells across the whole dataframe
down_pointers = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print('\nMissing Values  \n', "\n", down_pointers)
print()
# Kept as the last expression so the notebook displays the count
data.isna().to_numpy().sum()
Missing Values  
 
 👇🏻👇🏻👇🏻

244935

# Preview the first 10 rows as a styled table with every missing cell
# highlighted, so the location of the gaps is visible at a glance.
print('\nMissing Values in the dataset : '
,emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3)
print("")

data.head(10).style.set_table_styles(
[
  # header cells
  {'selector': 'th',
   'props': [('background', '#610646'),
             ('color', 'white'),
             ('font-family', 'verdana'),
             ('font-size', '10pt')]},

  # body cells
  {'selector': 'td',
   'props': [('font-family', 'verdana'),
             ('padding','0em 0em')]},

  # zebra striping of the rows
  {'selector': 'tr:nth-of-type(odd)',
   'props': [('background', '#ABB2B9'),
             ('color', 'black')]},

  {'selector': 'tr:nth-of-type(even)',
   'props': [('background', 'white'),
             ('color', 'black')]},

  # hover effects
  {'selector': 'tr:hover',
   'props': [('background-color', 'pink')]},

  {'selector': 'th:hover',
   'props': [('font-size', '18pt')]},

  {'selector': 'tr:hover td:hover',
   'props': [('max-width', '1000px'),
             ('font-size', '18pt')]}
]
# The colour is passed positionally on purpose: the keyword was named
# `null_color` in older pandas and `color` in newer releases, while the
# first positional parameter is the highlight colour in both.
).highlight_null('#CCB3C5')
Missing Values in the dataset :  👇🏻👇🏻👇🏻

sofifa_id player_url short_name long_name age dob height_cm weight_kg nationality club overall potential value_eur wage_eur player_positions preferred_foot international_reputation weak_foot skill_moves work_rate body_type real_face release_clause_eur player_tags team_position team_jersey_number loaned_from joined contract_valid_until nation_position nation_jersey_number pace shooting passing dribbling defending physic gk_diving gk_handling gk_kicking gk_reflexes gk_speed gk_positioning player_traits attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes ls st rs lw lf cf rf rw lam cam ram lm lcm cm rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb
0 158023 https://sofifa.com/player/158023/lionel-messi/20/159586 L. Messi Lionel Andrés Messi Cuccittini 32 1987-06-24 170 72 Argentina FC Barcelona 94 94 95500000 565000 RW, CF, ST Left 5 4 4 Medium/Low Messi Yes 195800000.0 #Dribbler, #Distance Shooter, #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward RW 10.0 nan 2004-07-01 2021.0 nan nan 87.0 92.0 92.0 96.0 39.0 66.0 nan nan nan nan nan nan Beat Offside Trap, Argues with Officials, Early Crosser, Finesse Shot, Speed Dribbler (CPU AI Only), 1-on-1 Rush, Giant Throw-in, Outside Foot Shot 88 95 70 92 88 97 93 94 92 96 91 84 93 95 95 86 68 75 68 94 48 40 94 94 75 96 33 37 26 6 11 15 14 8 89+2 89+2 89+2 93+2 93+2 93+2 93+2 93+2 93+2 93+2 93+2 92+2 87+2 87+2 87+2 92+2 68+2 66+2 66+2 66+2 68+2 63+2 52+2 52+2 52+2 63+2
1 20801 https://sofifa.com/player/20801/c-ronaldo-dos-santos-aveiro/20/159586 Cristiano Ronaldo Cristiano Ronaldo dos Santos Aveiro 34 1985-02-05 187 83 Portugal Juventus 93 93 58500000 405000 ST, LW Right 5 4 5 High/Low C. Ronaldo Yes 96500000.0 #Speedster, #Dribbler, #Distance Shooter, #Acrobat, #Clinical Finisher, #Complete Forward LW 7.0 nan 2018-07-10 2022.0 LS 7.0 90.0 93.0 82.0 89.0 35.0 78.0 nan nan nan nan nan nan Long Throw-in, Selfish, Argues with Officials, Early Crosser, Speed Dribbler (CPU AI Only), Skilled Dribbling 84 94 89 83 87 89 81 76 77 92 89 91 87 96 71 95 95 85 78 93 63 29 95 82 85 95 28 32 24 7 11 15 14 11 91+3 91+3 91+3 89+3 90+3 90+3 90+3 89+3 88+3 88+3 88+3 88+3 81+3 81+3 81+3 88+3 65+3 61+3 61+3 61+3 65+3 61+3 53+3 53+3 53+3 61+3
2 190871 https://sofifa.com/player/190871/neymar-da-silva-santos-jr/20/159586 Neymar Jr Neymar da Silva Santos Junior 27 1992-02-05 175 68 Brazil Paris Saint-Germain 92 92 105500000 290000 LW, CAM Right 5 5 5 High/Medium Neymar Yes 195200000.0 #Speedster, #Dribbler, #Playmaker  , #Crosser, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Midfielder, #Complete Forward CAM 10.0 nan 2017-08-03 2022.0 LW 10.0 91.0 85.0 87.0 95.0 32.0 58.0 nan nan nan nan nan nan Power Free-Kick, Injury Free, Selfish, Early Crosser, Speed Dribbler (CPU AI Only), Crowd Favourite 87 87 62 87 87 96 88 87 81 95 94 89 96 92 84 80 61 81 49 84 51 36 87 90 90 94 27 26 29 9 9 15 15 11 84+3 84+3 84+3 90+3 89+3 89+3 89+3 90+3 90+3 90+3 90+3 89+3 82+3 82+3 82+3 89+3 66+3 61+3 61+3 61+3 66+3 61+3 46+3 46+3 46+3 61+3
3 200389 https://sofifa.com/player/200389/jan-oblak/20/159586 J. Oblak Jan Oblak 26 1993-01-07 188 87 Slovenia Atlético Madrid 91 93 77500000 125000 GK Right 3 3 1 Medium/Medium Normal Yes 164700000.0 nan GK 13.0 nan 2014-07-16 2023.0 GK 1.0 nan nan nan nan nan nan 87.0 92.0 78.0 89.0 52.0 90.0 Flair, Acrobatic Clearance 13 11 15 43 13 12 13 14 40 30 43 60 67 88 49 59 78 41 78 12 34 19 11 65 11 68 27 12 18 87 92 78 90 89 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
4 183277 https://sofifa.com/player/183277/eden-hazard/20/159586 E. Hazard Eden Hazard 28 1991-01-07 175 74 Belgium Real Madrid 91 91 90000000 470000 LW, CF Right 4 4 4 High/Medium Normal Yes 184500000.0 #Speedster, #Dribbler, #Acrobat LW 7.0 nan 2019-07-01 2024.0 LF 10.0 91.0 83.0 86.0 94.0 35.0 66.0 nan nan nan nan nan nan Beat Offside Trap, Selfish, Finesse Shot, Speed Dribbler (CPU AI Only), Crowd Favourite 81 84 61 89 83 95 83 79 83 94 94 88 95 90 94 82 56 84 63 80 54 41 87 89 88 91 34 27 22 11 12 6 8 8 83+3 83+3 83+3 89+3 88+3 88+3 88+3 89+3 89+3 89+3 89+3 89+3 83+3 83+3 83+3 89+3 66+3 63+3 63+3 63+3 66+3 61+3 49+3 49+3 49+3 61+3
5 192985 https://sofifa.com/player/192985/kevin-de-bruyne/20/159586 K. De Bruyne Kevin De Bruyne 28 1991-06-28 181 70 Belgium Manchester City 91 91 90000000 370000 CAM, CM Right 4 5 4 High/High Normal Yes 166500000.0 #Dribbler, #Playmaker  , #Engine, #Distance Shooter, #Crosser, #Complete Midfielder RCM 17.0 nan 2015-08-30 2023.0 RCM 7.0 76.0 86.0 92.0 86.0 61.0 78.0 nan nan nan nan nan nan Power Free-Kick, Avoids Using Weaker Foot, Dives Into Tackles (CPU AI Only), Leadership, Argues with Officials, Finesse Shot 93 82 55 92 82 86 85 83 91 91 77 76 78 91 76 91 63 89 74 90 76 61 88 94 79 91 68 58 51 15 13 5 10 13 82+3 82+3 82+3 87+3 87+3 87+3 87+3 87+3 88+3 88+3 88+3 88+3 87+3 87+3 87+3 88+3 77+3 77+3 77+3 77+3 77+3 73+3 66+3 66+3 66+3 73+3
6 192448 https://sofifa.com/player/192448/marc-andre-ter-stegen/20/159586 M. ter Stegen Marc-André ter Stegen 27 1992-04-30 187 85 Germany FC Barcelona 90 93 67500000 250000 GK Right 3 4 1 Medium/Medium Normal Yes 143400000.0 nan GK 1.0 nan 2014-07-01 2022.0 SUB 22.0 nan nan nan nan nan nan 88.0 85.0 88.0 90.0 45.0 88.0 Swerve Pass, Acrobatic Clearance, Flair Passes 18 14 11 61 14 21 18 12 63 30 38 50 37 86 43 66 79 35 78 10 43 22 11 70 25 70 25 13 10 88 85 88 88 90 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
7 203376 https://sofifa.com/player/203376/virgil-van-dijk/20/159586 V. van Dijk Virgil van Dijk 27 1991-07-08 193 92 Netherlands Liverpool 90 91 78000000 200000 CB Right 3 3 2 Medium/Medium Normal Yes 150200000.0 #Tackling , #Tactician , #Strength, #Complete Defender LCB 4.0 nan 2018-01-01 2023.0 LCB 4.0 77.0 60.0 70.0 71.0 90.0 86.0 nan nan nan nan nan nan Diver, Avoids Using Weaker Foot, Leadership, Long Passer (CPU AI Only) 53 52 86 78 45 70 60 70 81 76 74 79 61 88 53 81 90 75 92 64 82 89 47 65 62 89 91 92 85 13 10 13 11 11 69+3 69+3 69+3 67+3 69+3 69+3 69+3 67+3 69+3 69+3 69+3 69+3 74+3 74+3 74+3 69+3 79+3 83+3 83+3 83+3 79+3 81+3 87+3 87+3 87+3 81+3
8 177003 https://sofifa.com/player/177003/luka-modric/20/159586 L. Modrić Luka Modrić 33 1985-09-09 172 66 Croatia Real Madrid 90 90 45000000 340000 CM Right 4 4 4 High/High Lean Yes 92300000.0 #Dribbler, #Playmaker  , #Crosser, #Acrobat, #Complete Midfielder RCM 10.0 nan 2012-08-01 2020.0 nan nan 74.0 76.0 89.0 89.0 72.0 66.0 nan nan nan nan nan nan Argues with Officials, Finesse Shot, Speed Dribbler (CPU AI Only), Crowd Favourite 86 72 55 92 76 87 85 78 88 92 77 71 92 89 93 79 68 85 58 82 62 82 79 91 82 92 68 76 71 13 9 7 14 9 77+3 77+3 77+3 84+3 83+3 83+3 83+3 84+3 86+3 86+3 86+3 85+3 87+3 87+3 87+3 85+3 81+3 81+3 81+3 81+3 81+3 79+3 72+3 72+3 72+3 79+3
9 209331 https://sofifa.com/player/209331/mohamed-salah/20/159586 M. Salah Mohamed Salah Ghaly 27 1992-06-15 175 71 Egypt Liverpool 90 90 80500000 240000 RW, ST Left 3 3 4 High/Medium PLAYER_BODY_TYPE_25 Yes 148900000.0 #Speedster, #Dribbler, #Acrobat, #Clinical Finisher, #Complete Forward RW 11.0 nan 2017-07-01 2023.0 RW 10.0 93.0 86.0 81.0 89.0 45.0 74.0 nan nan nan nan nan nan Beat Offside Trap, Argues with Officials, Early Crosser, Speed Dribbler (CPU AI Only), Outside Foot Shot 79 90 59 84 79 89 83 69 75 89 94 92 91 92 88 80 69 85 73 84 63 55 92 84 77 91 38 43 41 14 14 9 11 14 84+3 84+3 84+3 88+3 88+3 88+3 88+3 88+3 87+3 87+3 87+3 87+3 81+3 81+3 81+3 87+3 70+3 67+3 67+3 67+3 70+3 66+3 57+3 57+3 57+3 66+3

📝📝 There are 244935 missing values in the dataset.

# Per-column count of missing values, limited to the 50 columns with the most gaps
print('\nMissing Values in every column of the data : \n\n',
      " ", emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3)
print()

missing = (
    data.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(50)
    .to_frame(name='missing_values')
)

# (CSS selector, list of (property, value) pairs) for the rendered table
missing_table_rules = [
    ('th', [('background', '#A15F86'),
            ('color', 'white'),
            ('font-family', 'verdana'),
            ('font-size', '10pt')]),
    ('td', [('font-family', 'verdana'),
            ('padding', '0em 0em')]),
    ('tr:nth-of-type(odd)', [('background', '#ABB2B9'),
                             ('color', 'black')]),
    ('tr:nth-of-type(even)', [('background', 'white'),
                              ('color', 'black')]),
    ('tr:hover', [('background-color', 'pink')]),
    ('th:hover', [('font-size', '18pt')]),
    ('tr:hover td:hover', [('max-width', '1000px'),
                           ('font-size', '18pt')]),
]

# Kept as the last expression so the notebook renders the styled table
missing.style.set_table_styles(
    [{'selector': selector, 'props': props} for selector, props in missing_table_rules]
)
Missing Values in every column of the data : 

   👇🏻👇🏻👇🏻

missing_values
loaned_from 17230
nation_jersey_number 17152
nation_position 17152
player_tags 16779
gk_diving 16242
gk_handling 16242
gk_kicking 16242
gk_reflexes 16242
gk_speed 16242
gk_positioning 16242
player_traits 10712
rb 2036
st 2036
ls 2036
dribbling 2036
shooting 2036
rcb 2036
pace 2036
lw 2036
passing 2036
physic 2036
rs 2036
defending 2036
lf 2036
rw 2036
cf 2036
cam 2036
ram 2036
lm 2036
lcm 2036
cm 2036
rcm 2036
lam 2036
rm 2036
lwb 2036
ldm 2036
cdm 2036
rdm 2036
rwb 2036
lb 2036
lcb 2036
cb 2036
rf 2036
release_clause_eur 1298
joined 1288
contract_valid_until 240
team_jersey_number 240
team_position 240
real_face 0
body_type 0

📝📝 48 columns of the dataset contain missing values; the number of missing values in each of those columns is listed above.

3. Finding Features with one value

# Print every column name together with its number of distinct (non-null) values
print('\nUnique Values in each column of the data : \n\n',
      "\t", emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3)
print()

# The two marker emojis never change, so resolve them once before the loop
arrow_marker = emoji.emojize(":arrow_right:", use_aliases=True)
count_marker = emoji.emojize(":1234:", use_aliases=True)

for column in data.columns:
    distinct_count = data[column].nunique()
    print(arrow_marker, column, count_marker, distinct_count)
Unique Values in each column of the data : 

 	 👇🏻👇🏻👇🏻

➡ sofifa_id 🔢 18278
➡ player_url 🔢 18278
➡ short_name 🔢 17354
➡ long_name 🔢 18218
➡ age 🔢 27
➡ dob 🔢 6142
➡ height_cm 🔢 49
➡ weight_kg 🔢 56
➡ nationality 🔢 162
➡ club 🔢 698
➡ overall 🔢 47
➡ potential 🔢 47
➡ value_eur 🔢 214
➡ wage_eur 🔢 141
➡ player_positions 🔢 643
➡ preferred_foot 🔢 2
➡ international_reputation 🔢 5
➡ weak_foot 🔢 5
➡ skill_moves 🔢 5
➡ work_rate 🔢 9
➡ body_type 🔢 10
➡ real_face 🔢 2
➡ release_clause_eur 🔢 1224
➡ player_tags 🔢 83
➡ team_position 🔢 29
➡ team_jersey_number 🔢 99
➡ loaned_from 🔢 316
➡ joined 🔢 1760
➡ contract_valid_until 🔢 8
➡ nation_position 🔢 26
➡ nation_jersey_number 🔢 30
➡ pace 🔢 70
➡ shooting 🔢 79
➡ passing 🔢 68
➡ dribbling 🔢 71
➡ defending 🔢 76
➡ physic 🔢 61
➡ gk_diving 🔢 47
➡ gk_handling 🔢 47
➡ gk_kicking 🔢 52
➡ gk_reflexes 🔢 48
➡ gk_speed 🔢 53
➡ gk_positioning 🔢 50
➡ player_traits 🔢 921
➡ attacking_crossing 🔢 88
➡ attacking_finishing 🔢 93
➡ attacking_heading_accuracy 🔢 89
➡ attacking_short_passing 🔢 84
➡ attacking_volleys 🔢 87
➡ skill_dribbling 🔢 92
➡ skill_curve 🔢 89
➡ skill_fk_accuracy 🔢 89
➡ skill_long_passing 🔢 85
➡ skill_ball_control 🔢 90
➡ movement_acceleration 🔢 86
➡ movement_sprint_speed 🔢 86
➡ movement_agility 🔢 83
➡ movement_reactions 🔢 72
➡ movement_balance 🔢 81
➡ power_shot_power 🔢 80
➡ power_jumping 🔢 74
➡ power_stamina 🔢 86
➡ power_strength 🔢 75
➡ power_long_shots 🔢 90
➡ mentality_aggression 🔢 87
➡ mentality_interceptions 🔢 89
➡ mentality_positioning 🔢 94
➡ mentality_vision 🔢 85
➡ mentality_penalties 🔢 86
➡ mentality_composure 🔢 85
➡ defending_marking 🔢 92
➡ defending_standing_tackle 🔢 88
➡ defending_sliding_tackle 🔢 88
➡ goalkeeping_diving 🔢 73
➡ goalkeeping_handling 🔢 71
➡ goalkeeping_kicking 🔢 81
➡ goalkeeping_positioning 🔢 76
➡ goalkeeping_reflexes 🔢 75
➡ ls 🔢 94
➡ st 🔢 94
➡ rs 🔢 94
➡ lw 🔢 107
➡ lf 🔢 103
➡ cf 🔢 103
➡ rf 🔢 103
➡ rw 🔢 107
➡ lam 🔢 103
➡ cam 🔢 103
➡ ram 🔢 103
➡ lm 🔢 100
➡ lcm 🔢 88
➡ cm 🔢 88
➡ rcm 🔢 88
➡ rm 🔢 100
➡ lwb 🔢 98
➡ ldm 🔢 98
➡ cdm 🔢 98
➡ rdm 🔢 98
➡ rwb 🔢 98
➡ lb 🔢 96
➡ lcb 🔢 110
➡ cb 🔢 110
➡ rcb 🔢 110
➡ rb 🔢 96

📝📝 There is no feature having only 1 value.

4. Inserting a new column 'positions' based on the 'player_positions'

# Lookup from a single position code to its position-group index:
# 0 = Attacker, 1 = Midfielder, 2 = Defender, 3 = Goalkeeper
_POSITION_GROUP = {
    **dict.fromkeys(('ST', 'LW', 'RW', 'CF'), 0),
    **dict.fromkeys(('CAM', 'LM', 'CM', 'RM', 'CDM'), 1),
    **dict.fromkeys(('LWB', 'RWB', 'LB', 'CB', 'RB'), 2),
    'GK': 3,
}


def player_pos(row):
    """
    Map a player's listed positions to one of four position-group indices.

    ``row['player_positions']`` is expected to be a ', '-separated string of
    position codes. With one or two codes, the first code alone decides the
    group (``None`` is returned when that code is not recognised). With three
    or more codes, every recognised code votes for its group and the group
    with the most votes wins.

    Returns 0 (Attacker), 1 (Midfielder), 2 (Defender), 3 (Goalkeeper) or None.
    """
    codes = row['player_positions'].split(', ')

    # Short lists: only the first listed position matters.
    if len(codes) < 3:
        return _POSITION_GROUP.get(codes[0])

    # Longer lists: majority vote, unrecognised codes are skipped.
    votes = [0, 0, 0, 0]
    for code in codes:
        group = _POSITION_GROUP.get(code)
        if group is not None:
            votes[group] += 1

    # list.index returns the first maximum, so ties go to the lowest index.
    return votes.index(max(votes))

# Derive every player's position group with player_pos and store it in a new
# 'positions' column, translating the integer codes to readable labels
group_labels = {0: 'Attacker', 1: 'Midfielder', 2: 'Defender', 3: 'Goalkeeper'}
data['positions'] = data.apply(player_pos, axis=1).replace(group_labels)
def color_red(val):
    """Return the CSS colour declaration used to render a position label.

    'Attacker', 'Defender' and 'Midfielder' each get their own colour;
    any other value falls back to dark blue.
    """
    label_colours = (
        ('Attacker', 'brown'),
        ('Defender', 'darkgreen'),
        ('Midfielder', 'orange'),
    )
    for label, shade in label_colours:
        if val == label:
            return 'color: ' + shade
    return 'color: ' + 'darkblue'

# Show the first 50 values of the new column, coloured per position group
print('\nNew Column added as "postions" \n\n',
      " ", emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3)
print()

# (CSS selector, list of (property, value) pairs) for the rendered table
positions_table_rules = [
    ('th', [('background', '#035753'),
            ('color', 'white'),
            ('font-family', 'verdana'),
            ('font-size', '10pt')]),
    ('td', [('font-family', 'verdana'),
            ('padding', '0em 0em')]),
    ('tr:nth-of-type(odd)', [('background', '#ABB2B9'),
                             ('color', 'black')]),
    ('tr:nth-of-type(even)', [('background', 'white'),
                              ('color', 'black')]),
    ('tr:hover', [('background-color', 'pink')]),
    ('th:hover', [('font-size', '18pt')]),
    ('tr:hover td:hover', [('max-width', '1000px'),
                           ('font-size', '18pt')]),
]

positions_preview = data[['positions']].head(50)
# Kept as the last expression so the notebook renders the styled table
positions_preview.style.applymap(color_red).set_table_styles(
    [{'selector': selector, 'props': props} for selector, props in positions_table_rules]
)
New Column added as "postions" 

   👇🏻👇🏻👇🏻

positions
0 Attacker
1 Attacker
2 Attacker
3 Goalkeeper
4 Attacker
5 Midfielder
6 Goalkeeper
7 Defender
8 Midfielder
9 Attacker
10 Attacker
11 Defender
12 Attacker
13 Goalkeeper
14 Goalkeeper
15 Midfielder
16 Defender
17 Attacker
18 Defender
19 Attacker
20 Attacker
21 Midfielder
22 Attacker
23 Midfielder
24 Midfielder
25 Goalkeeper
26 Attacker
27 Midfielder
28 Goalkeeper
29 Defender
30 Goalkeeper
31 Goalkeeper
32 Goalkeeper
33 Midfielder
34 Attacker
35 Defender
36 Midfielder
37 Midfielder
38 Attacker
39 Attacker
40 Defender
41 Midfielder
42 Midfielder
43 Attacker
44 Midfielder
45 Defender
46 Attacker
47 Defender
48 Attacker
49 Defender

Exploratory Data Analysis

Exploring numerical features

# Collect every column whose dtype is not object and describe each one
numerical_features = [name for name, dtype in data.dtypes.items() if dtype != 'O']

right_pointer = emoji.emojize(":backhand_index_pointing_right_light_skin_tone:")
memo_marker = emoji.emojize(":memo:")
digits_marker = emoji.emojize(":1234:", use_aliases=True)

print('No. of numerical variables', right_pointer * 2, len(numerical_features))
print()

# One line per numerical column: name, dtype and number of distinct values
# (unique() counts NaN as a value, unlike nunique())
for feature in numerical_features:
    series = data[feature]
    print("The variable", memo_marker, str(feature),
          "has datatype", digits_marker, str(series.dtypes),
          "and", right_pointer, str(len(series.unique())),
          "unique values")
 
No. of numerical variables 👉🏻👉🏻 61

The variable 📝 sofifa_id has datatype 🔢 int64 and 👉🏻 18278 unique values
The variable 📝 age has datatype 🔢 int64 and 👉🏻 27 unique values
The variable 📝 height_cm has datatype 🔢 int64 and 👉🏻 49 unique values
The variable 📝 weight_kg has datatype 🔢 int64 and 👉🏻 56 unique values
The variable 📝 overall has datatype 🔢 int64 and 👉🏻 47 unique values
The variable 📝 potential has datatype 🔢 int64 and 👉🏻 47 unique values
The variable 📝 value_eur has datatype 🔢 int64 and 👉🏻 214 unique values
The variable 📝 wage_eur has datatype 🔢 int64 and 👉🏻 141 unique values
The variable 📝 international_reputation has datatype 🔢 int64 and 👉🏻 5 unique values
The variable 📝 weak_foot has datatype 🔢 int64 and 👉🏻 5 unique values
The variable 📝 skill_moves has datatype 🔢 int64 and 👉🏻 5 unique values
The variable 📝 release_clause_eur has datatype 🔢 float64 and 👉🏻 1225 unique values
The variable 📝 team_jersey_number has datatype 🔢 float64 and 👉🏻 100 unique values
The variable 📝 contract_valid_until has datatype 🔢 float64 and 👉🏻 9 unique values
The variable 📝 nation_jersey_number has datatype 🔢 float64 and 👉🏻 31 unique values
The variable 📝 pace has datatype 🔢 float64 and 👉🏻 71 unique values
The variable 📝 shooting has datatype 🔢 float64 and 👉🏻 80 unique values
The variable 📝 passing has datatype 🔢 float64 and 👉🏻 69 unique values
The variable 📝 dribbling has datatype 🔢 float64 and 👉🏻 72 unique values
The variable 📝 defending has datatype 🔢 float64 and 👉🏻 77 unique values
The variable 📝 physic has datatype 🔢 float64 and 👉🏻 62 unique values
The variable 📝 gk_diving has datatype 🔢 float64 and 👉🏻 48 unique values
The variable 📝 gk_handling has datatype 🔢 float64 and 👉🏻 48 unique values
The variable 📝 gk_kicking has datatype 🔢 float64 and 👉🏻 53 unique values
The variable 📝 gk_reflexes has datatype 🔢 float64 and 👉🏻 49 unique values
The variable 📝 gk_speed has datatype 🔢 float64 and 👉🏻 54 unique values
The variable 📝 gk_positioning has datatype 🔢 float64 and 👉🏻 51 unique values
The variable 📝 attacking_crossing has datatype 🔢 int64 and 👉🏻 88 unique values
The variable 📝 attacking_finishing has datatype 🔢 int64 and 👉🏻 93 unique values
The variable 📝 attacking_heading_accuracy has datatype 🔢 int64 and 👉🏻 89 unique values
The variable 📝 attacking_short_passing has datatype 🔢 int64 and 👉🏻 84 unique values
The variable 📝 attacking_volleys has datatype 🔢 int64 and 👉🏻 87 unique values
The variable 📝 skill_dribbling has datatype 🔢 int64 and 👉🏻 92 unique values
The variable 📝 skill_curve has datatype 🔢 int64 and 👉🏻 89 unique values
The variable 📝 skill_fk_accuracy has datatype 🔢 int64 and 👉🏻 89 unique values
The variable 📝 skill_long_passing has datatype 🔢 int64 and 👉🏻 85 unique values
The variable 📝 skill_ball_control has datatype 🔢 int64 and 👉🏻 90 unique values
The variable 📝 movement_acceleration has datatype 🔢 int64 and 👉🏻 86 unique values
The variable 📝 movement_sprint_speed has datatype 🔢 int64 and 👉🏻 86 unique values
The variable 📝 movement_agility has datatype 🔢 int64 and 👉🏻 83 unique values
The variable 📝 movement_reactions has datatype 🔢 int64 and 👉🏻 72 unique values
The variable 📝 movement_balance has datatype 🔢 int64 and 👉🏻 81 unique values
The variable 📝 power_shot_power has datatype 🔢 int64 and 👉🏻 80 unique values
The variable 📝 power_jumping has datatype 🔢 int64 and 👉🏻 74 unique values
The variable 📝 power_stamina has datatype 🔢 int64 and 👉🏻 86 unique values
The variable 📝 power_strength has datatype 🔢 int64 and 👉🏻 75 unique values
The variable 📝 power_long_shots has datatype 🔢 int64 and 👉🏻 90 unique values
The variable 📝 mentality_aggression has datatype 🔢 int64 and 👉🏻 87 unique values
The variable 📝 mentality_interceptions has datatype 🔢 int64 and 👉🏻 89 unique values
The variable 📝 mentality_positioning has datatype 🔢 int64 and 👉🏻 94 unique values
The variable 📝 mentality_vision has datatype 🔢 int64 and 👉🏻 85 unique values
The variable 📝 mentality_penalties has datatype 🔢 int64 and 👉🏻 86 unique values
The variable 📝 mentality_composure has datatype 🔢 int64 and 👉🏻 85 unique values
The variable 📝 defending_marking has datatype 🔢 int64 and 👉🏻 92 unique values
The variable 📝 defending_standing_tackle has datatype 🔢 int64 and 👉🏻 88 unique values
The variable 📝 defending_sliding_tackle has datatype 🔢 int64 and 👉🏻 88 unique values
The variable 📝 goalkeeping_diving has datatype 🔢 int64 and 👉🏻 73 unique values
The variable 📝 goalkeeping_handling has datatype 🔢 int64 and 👉🏻 71 unique values
The variable 📝 goalkeeping_kicking has datatype 🔢 int64 and 👉🏻 81 unique values
The variable 📝 goalkeeping_positioning has datatype 🔢 int64 and 👉🏻 76 unique values
The variable 📝 goalkeeping_reflexes has datatype 🔢 int64 and 👉🏻 75 unique values

Exploring categorical features.

# Collect every object-dtype column and report how many distinct values it holds
categorical_features = [name for name, dtype in data.dtypes.items() if dtype == 'O']

right_pointer = emoji.emojize(":backhand_index_pointing_right_light_skin_tone:")
letters_marker = emoji.emojize(":abcd:", use_aliases=True)

print('No. of categorical variables', right_pointer * 2, len(categorical_features))
print()

# unique() counts NaN as a value, unlike nunique()
for feature in categorical_features:
    distinct_total = len(data[feature].unique())
    print("The variable", letters_marker, "'" + str(feature) + "'",
          "has", right_pointer, str(distinct_total), "unique values")
No. of categorical variables 👉🏻👉🏻 44

The variable 🔡 'player_url' has 👉🏻 18278 unique values
The variable 🔡 'short_name' has 👉🏻 17354 unique values
The variable 🔡 'long_name' has 👉🏻 18218 unique values
The variable 🔡 'dob' has 👉🏻 6142 unique values
The variable 🔡 'nationality' has 👉🏻 162 unique values
The variable 🔡 'club' has 👉🏻 698 unique values
The variable 🔡 'player_positions' has 👉🏻 643 unique values
The variable 🔡 'preferred_foot' has 👉🏻 2 unique values
The variable 🔡 'work_rate' has 👉🏻 9 unique values
The variable 🔡 'body_type' has 👉🏻 10 unique values
The variable 🔡 'real_face' has 👉🏻 2 unique values
The variable 🔡 'player_tags' has 👉🏻 84 unique values
The variable 🔡 'team_position' has 👉🏻 30 unique values
The variable 🔡 'loaned_from' has 👉🏻 317 unique values
The variable 🔡 'joined' has 👉🏻 1761 unique values
The variable 🔡 'nation_position' has 👉🏻 27 unique values
The variable 🔡 'player_traits' has 👉🏻 922 unique values
The variable 🔡 'ls' has 👉🏻 95 unique values
The variable 🔡 'st' has 👉🏻 95 unique values
The variable 🔡 'rs' has 👉🏻 95 unique values
The variable 🔡 'lw' has 👉🏻 108 unique values
The variable 🔡 'lf' has 👉🏻 104 unique values
The variable 🔡 'cf' has 👉🏻 104 unique values
The variable 🔡 'rf' has 👉🏻 104 unique values
The variable 🔡 'rw' has 👉🏻 108 unique values
The variable 🔡 'lam' has 👉🏻 104 unique values
The variable 🔡 'cam' has 👉🏻 104 unique values
The variable 🔡 'ram' has 👉🏻 104 unique values
The variable 🔡 'lm' has 👉🏻 101 unique values
The variable 🔡 'lcm' has 👉🏻 89 unique values
The variable 🔡 'cm' has 👉🏻 89 unique values
The variable 🔡 'rcm' has 👉🏻 89 unique values
The variable 🔡 'rm' has 👉🏻 101 unique values
The variable 🔡 'lwb' has 👉🏻 99 unique values
The variable 🔡 'ldm' has 👉🏻 99 unique values
The variable 🔡 'cdm' has 👉🏻 99 unique values
The variable 🔡 'rdm' has 👉🏻 99 unique values
The variable 🔡 'rwb' has 👉🏻 99 unique values
The variable 🔡 'lb' has 👉🏻 97 unique values
The variable 🔡 'lcb' has 👉🏻 111 unique values
The variable 🔡 'cb' has 👉🏻 111 unique values
The variable 🔡 'rcb' has 👉🏻 111 unique values
The variable 🔡 'rb' has 👉🏻 97 unique values
The variable 🔡 'positions' has 👉🏻 4 unique values

# for annotating patches on the bars of the plots
def patches(plot, feature, r):
    """
    Annotate every bar of a plot with its percentage share, then show the figure.

    Parameters
    ----------
    plot : axes-like object
        Object exposing ``patches`` and ``annotate`` (e.g. the value returned
        by ``sns.countplot``). The annotations are written onto this object.
    feature : sized collection
        Only its length is used: ``len(feature)`` is the denominator of the
        percentage shown on each bar.
    r : number
        Rotation passed to ``annotate`` for the label text.
    """
    total = len(feature)
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        # centre the label horizontally on the bar ...
        x = p.get_x() + p.get_width() / 2.
        # ... and place it 20 data units above the top of the bar
        y = p.get_y() + p.get_height() + 20
        # Annotate the axes that was passed in rather than a global `ax`,
        # so the function works for any plot handed to it.
        plot.annotate(percentage, (x, y),ha='center', size = 12,rotation=r)
    plt.show()

    

Comparison of preferred foot over the different players

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Preferred Foot vs Players ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

plt.style.use('dark_background')
plt.figure(figsize=(18, 8))
# Bars are grouped by position (x) and split by preferred foot (hue).
# Pass the variables by keyword: newer seaborn releases no longer accept
# the data vector as the first positional argument.
ax = sns.countplot(x='positions', hue='preferred_foot', data=data, palette='pink')
plt.title('Most Preferred Foot of the Players', fontsize=20)
# The x-axis shows positions; preferred foot is carried by the legend.
plt.xlabel('Positions of Players', fontsize=16)
plt.ylabel('Count of the Players', fontsize=16)

# Percentages are relative to the total number of rows in `data.positions`
patches(ax, data.positions, 0)
				 📈📈📈📈📈 Distribution of Preferred Foot vs Players  📈📈📈📈📈

📝📝 The foot preferred by most players is the right foot.

Representation of share of international reputation

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of International Reputation vs Players ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

plt.figure(figsize=(18, 8))
# Keyword `x=`: newer seaborn releases reject a positional data vector
ax = sns.countplot(x='international_reputation', data=data, palette='Greens')
# Title typo fixed ('Repuatation' -> 'Reputation')
plt.title('International Reputation for the Football Players', fontsize=20)
plt.xlabel('International Reputation', fontsize=16)
plt.ylabel('Count of the Players', fontsize=16)

# Annotate each bar with its share of all rows
patches(ax, data.international_reputation, 0)
				 📈📈📈📈📈 Distribution of International Reputation vs Players  📈📈📈📈📈

📝📝 Around 92% of the players have an international reputation of 1 and 6.2% have a reputation of 2, which together make up around 98% of all players.

Different positions acquired by the players

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Positions vs Players ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

plt.figure(figsize=(18, 8))
# `x` is given by keyword: a bare positional column name is deprecated in
# seaborn 0.11 and is interpreted as `data` in later releases.
ax = sns.countplot(x='positions', data=data, palette='pastel')
plt.xlabel('Different Positions in Football', fontsize=16)
plt.ylabel('Count of Players', fontsize=16)
plt.title('Comparison of Positions and Players', fontsize=20)

# Annotate each bar with its share of all rows
patches(ax, data.positions, 0)
				 📈📈📈📈📈 Distribution of Positions vs Players  📈📈📈📈📈

📝📝 Our dataset is divided into 4 major categories of player positions: 19% of the players are Attackers, around 38% are Midfielders, 32% are Defenders and 11% are Goalkeepers.

Comparing the players' Wages

# Banner for the wage chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Wage vs Players ", chart_icons)
print()

# Histogram of `wage_eur` split into 10 bins
plt.figure(figsize=(18, 8))
ax = sns.histplot(data['wage_eur'], bins=10, color='skyblue')
ax.set_xlabel('Wage Range for Players', fontsize=16)
ax.set_ylabel('Count of the Players', fontsize=16)
ax.set_title('Distribution of Wages of Players', fontsize=20)
plt.xticks(rotation=90)

# Percentage labels on top of every bin, then show
patches(ax, data.wage_eur, 0)
				 📈📈📈📈📈 Distribution of Wage vs Players  📈📈📈📈📈

📝📝 98% of the players have wages less than 0.1 million Euros

Player with highest Wage

# Short names of the row(s) whose wage equals the column maximum;
# the first match is the one reported.
top_wage_player = data.loc[data['wage_eur'].eq(data['wage_eur'].max()), 'short_name']
print('The player with the highest wage is', top_wage_player.iloc[0])
The player with the highest wage is L. Messi

Comparing the players' Valuation

# Banner for the valuation chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Valuation vs Players ", chart_icons)
print()

# Histogram of `value_eur` split into 10 bins
plt.figure(figsize=(18, 8))
ax = sns.histplot(data['value_eur'], bins=10, color='pink')
ax.set_xlabel('Value Range for Players', fontsize=16)
ax.set_ylabel('Count of the Players', fontsize=16)
ax.set_title('Distribution of Value of Players', fontsize=20)
# Plain tick labels on x instead of scientific notation
ax.ticklabel_format(axis="x", style='plain')

# Percentage labels on top of every bin, then show
patches(ax, data.value_eur, 0)
				 📈📈📈📈📈 Distribution of Valuation vs Players  📈📈📈📈📈

📝📝 95% of the players have a valuation under 10 million Euros, and 3% of the players have a valuation between 10 and 20 million Euros.

Most expensive player

# Short names of the row(s) whose valuation equals the column maximum;
# the first match is the one reported.
most_expensive_player = data.loc[data['value_eur'].eq(data['value_eur'].max()), 'short_name']
print('The most expensive player is', most_expensive_player.iloc[0])
The most expensive player is Neymar Jr

Age vs Valuations

# grouping the players on the basis of age
# Bins are right-closed, so the first interval would be (min, 20] and every
# player whose age equals the minimum would get NaN as age_group.
# include_lowest=True closes the first interval on the left so they are kept.
data['age_group'] = pd.cut(
    data['age'],
    bins=[data['age'].min(), 20, 25, 30, 35, 40, data['age'].max()],
    labels=['20 and Under', '21 to 25', '26 to 30', '31 to 35', '36 to 40', 'Over 40'],
    include_lowest=True,
)

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Age vs Players ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# One point per player: valuation (y) against age bucket (x)
plt.figure(figsize=(18, 8))
ax = sns.stripplot(x=data['age_group'], y=data['value_eur'], palette='Reds')
plt.xlabel('Age Group of Players', fontsize=16)
plt.ylabel('Valuation in EUR', fontsize=16)
plt.title('Distribution of Valuation of players from different Age Groups', fontsize=20)

# Plain tick labels on y instead of scientific notation
plt.ticklabel_format(axis="y", style='plain')
plt.show()
				 📈📈📈📈📈 Distribution of Age vs Players  📈📈📈📈📈

# grouping the players on the basis of age
pointer_icons = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print('\nGrouping the players on the basis of age : \n\n', "\t", pointer_icons)
print()

# Number of (non-null) short names per age bucket
players_count = data.groupby(['age_group'])['short_name'].count()

# CSS rules for the rendered table: header, cells, zebra rows, hover effects
age_table_styles = [
    dict(selector='th',
         props=[('background', '#139BB4'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# Last expression of the cell: the styled table is what gets displayed
players_count.to_frame().style.set_table_styles(age_table_styles)
Grouping the players on the basis of age : 

 	 👇🏻👇🏻👇🏻

short_name
age_group
20 and Under 3127
21 to 25 6794
26 to 30 5562
31 to 35 2490
36 to 40 287
Over 40 6

📝📝 The valuation of players increases with age up to 30 years and starts decreasing after that. The 26-30 age group, which contains 5562 players, has the highest valuations.

Positions vs Valuations

# Banner for the valuation-by-position chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Valuation vs Positions ", chart_icons)
print()

# One point per player: valuation (y) against position group (x)
plt.figure(figsize=(18, 8))
ax = sns.stripplot(x=data['positions'], y=data['value_eur'], palette='Blues')
ax.set_xlabel('Positions of Players', fontsize=16)
ax.set_ylabel('Valuation in EUR', fontsize=16)
ax.set_title('Distribution of Valuation of players for different Positions', fontsize=20)

# Plain tick labels on y instead of scientific notation
ax.ticklabel_format(axis="y", style='plain')
plt.show()
				 📈📈📈📈📈 Distribution of Valuation vs Positions  📈📈📈📈📈

# grouping the players on the basis of positions
pointer_icons = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print('\nGrouping the players on the basis of positions : \n\n', "\t", pointer_icons)
print()

# Number of (non-null) short names per position group
positions_count = data.groupby(['positions'])['short_name'].count()

# CSS rules for the rendered table: header, cells, zebra rows, hover effects
positions_table_styles = [
    dict(selector='th',
         props=[('background', '#436C3A'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# Last expression of the cell: the styled table is what gets displayed
positions_count.to_frame().style.set_table_styles(positions_table_styles)
Grouping the players on the basis of positions : 

 	 👇🏻👇🏻👇🏻

short_name
positions
Attacker 3489
Defender 5878
Goalkeeper 2036
Midfielder 6875

📝📝 Some of the Attackers and Midfielders are highly valued.

Different skill moves of players

# Banner for the skill-moves chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Skill Moves vs Players ", chart_icons)
print()

# One bar per distinct `skill_moves` value
plt.figure(figsize=(18, 8))
ax = sns.countplot(data=data, x='skill_moves', palette='Oranges')
ax.set_title('Count of players on Basis of their skill moves', fontsize=20)
ax.set_xlabel('Number of Skill Moves', fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)

# Percentage labels on top of every bar, then show
patches(ax, data.skill_moves, 0)
				 📈📈📈📈📈 Distribution of Skill Moves vs Players  📈📈📈📈📈

📝📝 47% of the players have 2 skill moves , around 36% of players have 3 skill moves.

Height of players

# Banner for the height chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Height vs Players ", chart_icons)
print()

# One bar per distinct `height_cm` value
plt.figure(figsize=(18, 8))
ax = sns.countplot(data=data, x='height_cm', palette='dark')
ax.set_title('Count of players on Basis of Height', fontsize=20)
ax.set_xlabel('Height in cm', fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)
plt.xticks(rotation=90)

# Vertical percentage labels (rotation 90), then show
patches(ax, data.height_cm, 90)
				 📈📈📈📈📈 Distribution of Height vs Players  📈📈📈📈📈

📝📝 Almost 70% of players have height between 175cm to 188 cm .

Body weight of players

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Weight vs Players ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# Histogram of `weight_kg` with a KDE curve overlaid
plt.figure(figsize=(18, 8))
ax = sns.histplot(data['weight_kg'], kde=True, color='pink')
plt.title('Different Weights of the Players Participating in FIFA 2020', fontsize=20)
# The axis shows weights; the original label said 'Heights'
plt.xlabel('Weights associated with the players', fontsize=16)
plt.ylabel('Count of Players', fontsize=16)

# Vertical percentage labels (rotation 90), then show
patches(ax, data.weight_kg, 90)
				 📈📈📈📈📈 Distribution of Weight vs Players  📈📈📈📈📈

📝📝 60% of players have weight in the bracket of 70-80kg.

Work Rate of players

# Banner for the work-rate chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Work Rate vs Players ", chart_icons)
print()

# One bar per distinct `work_rate` value
plt.figure(figsize=(18, 8))
ax = sns.countplot(data=data, x='work_rate', palette='hls')
ax.set_title('Different work rates of the Players Participating in the FIFA 2020', fontsize=20)
ax.set_xlabel('Work rates associated with the players', fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)

# Percentage labels on top of every bar, then show
patches(ax, data.work_rate, 0)
				 📈📈📈📈📈 Distribution of Work Rate vs Players  📈📈📈📈📈

📝📝 90% of the players have work rates as Medium/Medium.

Different potential scores of the players

# Banner for the potential-score chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Potential vs Players ", chart_icons)
print()

# 50-bin histogram of `potential` with a KDE curve overlaid
plt.figure(figsize=(18, 8))
ax = sns.histplot(data['potential'], bins=50, kde=True)
ax.set_xlabel("Player's Potential Scores", fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)
ax.set_title('Histogram of players Potential Scores', fontsize=20)

# Vertical percentage labels (rotation 90), then show
patches(ax, data.potential, 90)
				 📈📈📈📈📈 Distribution of Potential vs Players  📈📈📈📈📈

📝📝 90% of the players have a potential score between 60 and 80.

Different overall scores of the players

# Banner for the overall-score chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Overall vs Players ", chart_icons)
print()

# 50-bin magenta histogram of `overall` with a KDE curve overlaid
plt.figure(figsize=(18, 8))
ax = sns.histplot(data['overall'], bins=50, kde=True, color='m')
ax.set_xlabel('Overall score range', fontsize=16)
ax.set_ylabel('Count of the Players', fontsize=16)
ax.set_title('Histogram for the Overall Scores of the Players', fontsize=20)

# Vertical percentage labels (rotation 90), then show
patches(ax, data.overall, 90)
				 📈📈📈📈📈 Distribution of Overall vs Players  📈📈📈📈📈

📝📝 90% of the players have an overall score between 50 and 80.

Age vs Overall Scores

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Overall vs Age ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# catplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure() only produces an extra, empty figure
# ("<Figure size 1440x720 with 0 Axes>"). No plt.figure() call is made here.
# One box-plot panel per position group (col='positions').
ax = sns.catplot(x='age', y='overall', col='positions', data=data, kind="box", palette='Reds')
# Keep `fig` bound, now to the figure that actually holds the panels
fig = ax.fig
ax.set_axis_labels(x_var="Age of Players", y_var="Overall Scores")
ax.set_xticklabels(rotation=90)
#ax.fig.suptitle('Distribution of player Overall ratings vs Age ',fontsize = 20)
plt.show()
				 📈📈📈📈📈 Distribution of Overall vs Age  📈📈📈📈📈

<Figure size 1440x720 with 0 Axes>

📝📝 Overall scores usually increase with age.

Different nations participating in FIFA 2020

# Banner for the nationality chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Nations vs Players ", chart_icons)
print()

plt.figure(figsize=(18, 8))
# Player counts for the 100 most frequent nationalities, drawn as bars
nation_counts = data['nationality'].value_counts().head(100)
ax = nation_counts.plot.bar(color='orange', figsize=(20, 7))
ax.set_title('Different Nations Participating in FIFA 2020', fontsize=30, fontweight=20)
ax.set_xlabel('Name of The Countries', fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)

# Vertical percentage labels relative to all rows, then show
patches(ax, data.nationality, 90)
				 📈📈📈📈📈 Distribution of Nations vs Players  📈📈📈📈📈

📝📝 England , Germany , Spain , France , Argentina , Brazil , Italy , Colombia – these 8 countries consist of 45% of overall data.

Countries with most players

# Heading for the table of the most common nationalities
pointer_icons = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print('\nNations with most players \n\n', "\t ", pointer_icons)
print()

# Player counts for the 50 most frequent nationalities
nationality = data['nationality'].value_counts().head(50)

# CSS rules for the rendered table: header, cells, zebra rows, hover effects
nationality_table_styles = [
    dict(selector='th',
         props=[('background', '#636A92'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]

# Last expression of the cell: the styled table is what gets displayed
nationality.to_frame().style.set_table_styles(nationality_table_styles)
Nations with most players 

 	  👇🏻👇🏻👇🏻

nationality
England 1667
Germany 1216
Spain 1035
France 984
Argentina 886
Brazil 824
Italy 732
Colombia 591
Japan 453
Netherlands 416
China PR 373
Chile 370
Sweden 358
Norway 350
Republic of Ireland 348
United States 347
Denmark 345
Portugal 344
Mexico 340
Poland 324
Korea Republic 322
Austria 319
Saudi Arabia 310
Turkey 294
Romania 287
Scotland 277
Belgium 268
Switzerland 229
Australia 196
Uruguay 164
Serbia 139
Ghana 130
Senegal 127
Croatia 126
Nigeria 126
Wales 117
Ivory Coast 105
Czech Republic 102
Greece 96
Morocco 94
Russia 81
Northern Ireland 81
Paraguay 80
Cameroon 78
South Africa 72
Finland 72
Ukraine 69
Venezuela 66
Bosnia Herzegovina 66
Canada 61

📝📝 Most of the players are from England, then Germany and then followed by Spain.

Every nation's player and their weights

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Nations vs Weight ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# Ten nationalities to compare. The misspelt 'Columbia' entry was removed:
# the dataset spells it 'Colombia', so the extra entry never matched.
top_10_countries = ('England', 'Germany', 'Spain', 'Argentina', 'France', 'Brazil',
                    'Italy', 'Colombia', 'Japan', 'Netherlands')
# Keep players of those nationalities that have a weight recorded.
# The original `mask & data['weight_kg']` AND-ed a boolean mask with the
# numeric column itself; for integer weights that is a bitwise AND with 1,
# which silently kept only the players with an odd weight.
data_countries = data.loc[data['nationality'].isin(top_10_countries) & data['weight_kg'].notna()]

# One box per nationality showing the spread of `weight_kg`
plt.figure(figsize=(18, 8))
ax = sns.boxplot(x=data_countries['nationality'], y=data_countries['weight_kg'], palette='Reds')
ax.set_xlabel(xlabel='Countries', fontsize=16)
ax.set_ylabel(ylabel='Weight in kg', fontsize=16)
ax.set_title(label='Distribution of Weight of players from different countries', fontsize=20)
plt.show()
				 📈📈📈📈📈 Distribution of Nations vs Weight  📈📈📈📈📈

📝📝 Mostly European countries have avg player weight at around 77 Kg , however for Asian countries like Japan it is close to 72 Kg.

Different Clubs associated with players

# Banner for the clubs chart
chart_icons = emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5
print("\t\t\t\t", chart_icons, "Distribution of Clubs vs Players ", chart_icons)
print()

plt.figure(figsize=(18, 8))
# Player counts for the 100 most frequent clubs, drawn as white bars
club_counts = data['club'].value_counts().head(100)
ax = club_counts.plot.bar(color='white')
ax.set_title('Different clubs associated with players', fontsize=30, fontweight=20)
ax.set_xlabel('Name of The Clubs', fontsize=16)
ax.set_ylabel('Count of Players', fontsize=16)
plt.show()
				 📈📈📈📈📈 Distribution of Clubs vs Players  📈📈📈📈📈

📝📝 Some of the clubs have 33 players, few have 32 players, few have 31 players and most of the clubs have 30 players.

Popular clubs

# Banner printed above the chart
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
      "Distribution of Overall vs Popular Clubs ",
      emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# Hand-picked clubs to compare. 'Manchestar City' was a typo that never
# matched the dataset's 'Manchester City'.
some_clubs = ('CD Leganés', 'Southampton', 'RC Celta', 'Empoli', 'Fortuna Düsseldorf', 'Manchester City',
              'Tottenham Hotspur', 'FC Barcelona', 'Valencia CF', 'Chelsea', 'Real Madrid')

# Keep players of those clubs that have an overall score recorded.
# The original `mask & data['overall']` AND-ed a boolean mask with the
# numeric column itself; for integer scores that is a bitwise AND with 1,
# which silently kept only the players with an odd overall score.
data_clubs = data.loc[data['club'].isin(some_clubs) & data['overall'].notna()]

# One box per club showing the spread of `overall`
plt.figure(figsize=(18, 8))
ax = sns.boxplot(x=data_clubs['club'], y=data_clubs['overall'], palette='inferno')
ax.set_xlabel(xlabel='Some Popular Clubs', fontsize=16)
ax.set_ylabel(ylabel='Overall Score', fontsize=16)
ax.set_title(label='Distribution of Overall Score in Different popular Clubs', fontsize=20)
plt.xticks(rotation=90)
plt.show()
				 📈📈📈📈📈 Distribution of Overall vs Popular Clubs  📈📈📈📈📈

📝📝 The average overall score of the popular clubs is between 70-85.

Clubs which pay highest wage

# Number of distinct clubs
right_pointer = emoji.emojize(":backhand_index_pointing_right_light_skin_tone:")
print('\nNo. of Clubs', right_pointer, data.club.nunique())
print()

# Every distinct club value, in order of first appearance
down_pointers = emoji.emojize(":backhand_index_pointing_down_light_skin_tone:") * 3
print('\nClubs\n', "\n", down_pointers, "\n", "\n  ", data.club.unique())
No. of Clubs 👉🏻 698


Clubs
 
 👇🏻👇🏻👇🏻 
 
   ['FC Barcelona' 'Juventus' 'Paris Saint-Germain' 'Atlético Madrid'
 'Real Madrid' 'Manchester City' 'Liverpool' 'Napoli' 'Tottenham Hotspur'
 'Manchester United' 'Chelsea' 'FC Bayern München' 'Inter'
 'Borussia Dortmund' 'Arsenal' 'Valencia CF' 'Lazio' 'Milan' 'Sporting CP'
 'Olympique Lyonnais' 'RB Leipzig' 'Ajax' 'LA Galaxy' 'Atalanta'
 'RC Celta' 'Bayer 04 Leverkusen' 'Real Betis' 'FC Porto'
 'SV Werder Bremen' 'West Ham United' 'Wolverhampton Wanderers'
 'AS Saint-Étienne' 'Torino' 'Dalian YiFang FC' 'Borussia Mönchengladbach'
 'Roma' 'Guangzhou Evergrande Taobao FC' 'SL Benfica'
 'Medipol Başakşehir FK' 'Everton' 'VfL Wolfsburg' 'Crystal Palace'
 'Getafe CF' 'Shanghai SIPG FC' 'Eintracht Frankfurt'
 'Olympique de Marseille' 'Hertha BSC' 'RSC Anderlecht' 'Villarreal CF'
 'Sampdoria' 'Leicester City' 'AS Monaco' 'Jiangsu Suning FC'
 'Los Angeles FC' 'Cagliari' 'Sevilla FC' 'Fenerbahçe SK' 'Real Sociedad'
 'TSG 1899 Hoffenheim' 'Atlético Mineiro' 'Grêmio' 'PSV'
 'Athletic Club de Bilbao' 'Deportivo Alavés' 'Boca Juniors'
 'Lokomotiv Moscow' 'Al Nassr' 'Brescia' 'Shakhtar Donetsk'
 'Shanghai Greenland Shenhua FC' 'PFC CSKA Moscow'
 'Beijing Sinobo Guoan FC' 'Levante UD' 'Cruzeiro' 'Uruguay'
 'Montpellier HSC' 'Atlanta United' 'Watford' '1. FC Köln' 'Bournemouth'
 'Beşiktaş JK' 'Real Valladolid CF' 'Racing Club' 'Al Hilal'
 'Guangzhou R&F FC' 'Sassuolo' 'FC Girondins de Bordeaux' 'LOSC Lille'
 'Galatasaray SK' 'Chicago Fire' 'Fluminense' 'Ecuador' 'RCD Espanyol'
 'Dinamo Zagreb' 'FC Nantes' 'River Plate' 'OGC Nice' 'Newcastle United'
 'Brighton & Hove Albion' 'Club Brugge KV' 'FC Schalke 04' 'SD Eibar'
 'DC United' 'Orlando City SC' 'Hebei China Fortune FC' 'Tigres U.A.N.L.'
 'Aston Villa' 'Montreal Impact' 'Olympiacos CFP' 'Norwich City'
 'Feyenoord' 'Toronto FC' 'KRC Genk' 'Fiorentina' 'Spartak Moscow'
 'Dynamo Kyiv' 'SK Slavia Praha' 'Southampton' 'Burnley' 'SC Braga'
 'Russia' 'RC Strasbourg Alsace' 'Wuhan Zall' 'Vissel Kobe'
 'Portland Timbers' 'Genoa' 'Beijing Renhe FC' 'Toulouse Football Club'
 'Girona FC' 'Real Zaragoza' 'CD Leganés' 'Shenzhen FC' 'Internacional'
 'CSA - AL' 'Santos' '1. FSV Mainz 05' 'Stoke City' 'Udinese' 'Colombia'
 'Angers SCO' 'FC Augsburg' 'Netherlands' 'Fulham' 'FC København'
 'KAA Gent' 'SC Freiburg' 'Stade Rennais FC' 'Club América' 'Trabzonspor'
 'BSC Young Boys' 'Helsingborgs IF' 'Kaizer Chiefs' 'Parma' 'Mexico'
 'Royal Antwerp FC' 'Tianjin TEDA FC' 'Hannover 96' 'Tianjin Quanjian FC'
 'Al Ahli' 'Bologna' 'VfB Stuttgart' 'Seattle Sounders FC' 'Godoy Cruz'
 'Sparta Praha' 'Independiente Medellín' 'Sivasspor' 'Independiente'
 'Nîmes Olympique' 'Club Tijuana' 'SPAL' 'Monterrey' 'CD Tondela'
 'Fortuna Düsseldorf' 'Vitória Guimarães' 'AZ Alkmaar'
 'Atlético de San Luis' 'PAOK' 'Nagoya Grampus' 'Club Atlético Banfield'
 'Shandong Luneng TaiShan FC' 'New York Red Bulls' 'FC Red Bull Salzburg'
 'Sweden' 'Amiens SC' 'Hellas Verona' 'Cruz Azul' 'Gençlerbirliği SK'
 '1. FC Union Berlin' 'Standard de Liège'
 'Chongqing Dangdai Lifan FC SWM Team' 'New England Revolution'
 'Club Atlético Colón' 'Celtic' 'Club Atlas' 'Botafogo'
 'En Avant de Guingamp' 'West Bromwich Albion' 'Pachuca' 'AEK Athens'
 'Portimonense SC' 'Real Salt Lake' 'FC Utrecht' 'Sheffield United'
 "Newell's Old Boys" 'Club Atlético Talleres' 'Philadelphia Union'
 'Rosenborg BK' 'FC Basel 1893' 'Brentford' 'Club León'
 'Unión de Santa Fe' 'Deportivo de La Coruña' 'Rangers FC' 'Turkey'
 'Atiker Konyaspor' 'Granada CF' 'Perth Glory' 'Club Atlético Lanús'
 'Hamburger SV' 'Al Ittihad' 'Santos Laguna' 'Western United FC'
 'Columbus Crew SC' 'Deportivo Toluca' 'Cardiff City' 'CA Osasuna'
 'Swansea City' 'Melbourne Victory' 'Leeds United' 'Göztepe SK' 'Hungary'
 'New York City FC' 'Bulgaria' 'Kayserispor' 'Minnesota United FC'
 'Guadalajara' 'FC Groningen' 'Paraguay' 'Junior FC' 'Al Taawoun'
 'Huddersfield Town' 'Ettifaq FC' 'Stade de Reims' 'Rayo Vallecano'
 'San Lorenzo de Almagro' 'Bahia' 'Atlético Paranaense' 'Goiás' 'Avaí FC'
 'Fortaleza' 'Kawasaki Frontale' 'Vélez Sarsfield' 'Hull City'
 'Houston Dynamo' 'Birmingham City' 'Al Wehda' 'Nottingham Forest'
 'CD Tenerife' 'Aalborg BK' 'Preston North End' 'Bristol City' 'Lecce'
 'Gimnasia y Esgrima La Plata' 'CD Aves' 'Viktoria Plzeň' '1. FC Nürnberg'
 'Slovenia' 'FC Metz' 'FC Midtjylland' 'Molde FK' 'Colo-Colo'
 'RCD Mallorca' 'FC Sion' 'Wisła Kraków' 'Denizlispor' 'Middlesbrough'
 'Universidad Católica' 'Alanyaspor' 'Moreirense FC' "Côte d'Ivoire"
 'Stade Brestois 29' 'Chievo Verona' 'Boavista FC' 'MKE Ankaragücü'
 'Sydney FC' 'Al Ain FC' 'Yeni Malatyaspor' 'Urawa Red Diamonds'
 'Querétaro' 'Sporting Kansas City' 'Málaga CF' '1. FC Heidenheim 1846'
 'Atlético Tucumán' 'Gazişehir Gaziantep F.K.' 'Clube Sport Marítimo'
 'Chapecoense' 'Atlético Nacional' 'Vitesse' 'Australia' 'Henan Jianye FC'
 'Panathinaikos FC' 'Blackburn Rovers' 'Santa Clara' 'Cameroon'
 'Puebla FC' 'U.N.A.M.' 'SD Huesca' 'Estudiantes de La Plata' 'Austria'
 'KAS Eupen' 'LASK Linz' 'Sporting de Charleroi' 'Daegu FC'
 'Real Sporting de Gijón' 'Derby County' 'Rio Ave FC' 'South Africa'
 'Famalicão' 'Neuchâtel Xamax' 'Benevento' 'Cádiz CF' 'AIK'
 'Sheffield Wednesday' 'Empoli' 'Colorado Rapids' 'Os Belenenses'
 'Unión Magdalena' 'Real Oviedo' 'Peru' 'Antalyaspor' 'Dijon FCO'
 'SV Sandhausen' 'Rosario Central' 'Reading' 'DSC Arminia Bielefeld'
 'Malmö FF' 'Jeonbuk Hyundai Motors' 'Frosinone' 'FC Tokyo' 'Canada'
 'Çaykur Rizespor' 'FCSB (Steaua)' 'Defensa y Justicia' 'Monarcas Morelia'
 'Club Atlético Huracán' 'Ceará Sporting Club' 'Argentinos Juniors'
 'Al Shabab' 'Legia Warszawa' 'Shimizu S-Pulse' 'Millonarios FC'
 'Lechia Gdańsk' 'Brøndby IF' 'Albacete BP' 'FC Lorient'
 'Universitatea Craiova' 'Deportivo Cali' 'SK Rapid Wien'
 'Kashima Antlers' 'Poland' 'Elche CF' 'Club Atlético Aldosivi'
 'Deportes Tolima' 'Cúcuta Deportivo' 'Club Necaxa' 'Piast Gliwice'
 'Pescara' 'Kasimpaşa SK' 'Egypt' 'Holstein Kiel' 'Livorno'
 'Coquimbo Unido' 'UD Las Palmas' 'Górnik Zabrze' 'FC Twente' 'Paris FC'
 'Racing Club de Lens' 'VfL Bochum 1848' 'Sunderland' 'Aberdeen'
 'Heart of Midlothian' 'Romania' 'Crotone' 'Iceland' 'CFR Cluj'
 'FK Austria Wien' 'UD Almería' 'SK Brann' 'BK Häcken' 'Al Fateh'
 'CD Everton de Viña del Mar' 'FC St. Pauli' 'Gamba Osaka'
 'Yokohama F. Marinos' 'Djurgårdens IF' 'FC Dallas' 'Sagan Tosu'
 'Al Fayha' 'Chile' 'Once Caldas' 'Atlético Bucaramanga' 'FC Cincinnati'
 'San Jose Earthquakes' 'América de Cali' 'Perugia' 'La Equidad'
 'SV Darmstadt 98' 'FC Zürich' 'SC Paderborn 07' 'SV Zulte-Waregem'
 'IFK Norrköping' 'FC Viitorul' 'Pordenone' 'AD Alcorcón'
 'Sint-Truidense VV' 'KV Kortrijk' 'Royal Excel Mouscron' 'FC Lugano'
 'Hokkaido Consadole Sapporo' 'Spezia' 'FC Paços de Ferreira'
 'Servette FC' 'US Salernitana 1919' 'Lech Poznań' 'ESTAC Troyes'
 'Fortuna Sittard' 'KV Oostende' 'VVV-Venlo' 'KSV Cercle Brugge'
 'FC Seoul' 'Cerezo Osaka' 'Stade Malherbe Caen' 'Universidad de Chile'
 'Kalmar FF' 'KV Mechelen' 'Deportes Iquique' 'US Cremonese' 'Bolivia'
 'Valenciennes FC' 'Ulsan Hyundai FC' 'Vegalta Sendai' 'ADO Den Haag'
 'CD Palestino' 'Cittadella' 'Vitória de Setúbal' 'FC Nordsjælland'
 'Charlton Athletic' 'Al Raed' 'Jagiellonia Białystok' 'Odense Boldklub'
 'FC Thun' 'SG Dynamo Dresden' 'Hammarby IF' 'Central Córdoba'
 'Queens Park Rangers' ' SSV Jahn Regensburg'
 'Tiburones Rojos de Veracruz' 'Patronato' 'Virtus Entella'
 'SK Sturm Graz' 'Kilmarnock' 'Extremadura UD' 'Willem II'
 'Gil Vicente FC' 'Randers FC' 'Ascoli' 'Vancouver Whitecaps FC'
 'Millwall' 'Pohang Steelers' 'Suwon Samsung Bluewings' 'Heracles Almelo'
 'Al Hazem' 'FC Juárez' 'SC Heerenveen' 'Dinamo Bucureşti' 'Gyeongnam FC'
 'KFC Uerdingen 05' 'PEC Zwolle' 'Al Faisaly' 'Arsenal de Sarandí'
 'FC Ingolstadt 04' 'Wolfsberger AC' 'Wigan Athletic' 'Júbilo Iwata'
 'CD Lugo' 'SpVgg Greuther Fürth' 'FC Emmen' 'Cosenza' 'AJ Auxerre'
 'Cracovia' 'FC Erzgebirge Aue' 'Grenoble Foot 38' 'Korona Kielce'
 'Alianza Petrolera' 'CD Universidad de Concepción' 'Melbourne City FC'
 'CD Antofagasta' 'Independiente Santa Fe' 'Sanfrecce Hiroshima'
 'Damac FC' 'Chamois Niortais Football Club' 'Atlético Huila'
 'CD Numancia' 'Luton Town' 'Unión La Calera' 'Le Havre AC'
 'AS Nancy Lorraine' 'Audax Italiano' 'Hibernian' 'Barnsley'
 'Pogoń Szczecin' 'Sarpsborg 08 FF' 'Kristiansund BK' 'FC Luzern'
 'Unión Española' 'IFK Göteborg' 'Envigado FC' 'Jeju United FC'
 'Patriotas Boyacá FC' 'Peterborough United' 'Deportivo Pasto' 'Wales'
 'Clermont Foot 63' 'Northern Ireland' 'Astra Giurgiu' 'Racing Santander'
 'Pisa' 'Newcastle Jets' 'Castellammare di Stabia' 'Rionegro Águilas'
 'Vålerenga Fotball' "CD O'Higgins" 'AC Ajaccio' 'CF Fuenlabrada'
 'VfL Osnabrück' 'Arka Gdynia' 'FC Sochaux-Montbéliard'
 'SpVgg Unterhaching' 'Central Coast Mariners' 'SV Wehen Wiesbaden'
 'Fleetwood Town' 'Venezia FC' 'Karlsruher SC' 'Wisła Płock' 'Gangwon FC'
 'IF Elfsborg' 'Sangju Sangmu FC' 'Zagłębie Lubin' 'Abha Club'
 'Jaguares de Córdoba' 'FC Botoşani' 'Motherwell' 'FC St. Gallen'
 'Bayern München II' 'CD Mirandés' 'St. Johnstone FC' 'Lillestrøm SK'
 'Odds BK' 'SD Ponferradina' 'Doncaster Rovers' 'Örebro SK'
 'Shamrock Rovers' 'FSV Zwickau' '1. FC Kaiserslautern' 'Portsmouth'
 'Waasland-Beveren' 'Ipswich Town' 'Dundalk' 'CD Huachipato' 'Le Mans FC'
 'Strømsgodset IF' 'Sepsi OSK' 'Sparta Rotterdam'
 'FC Admira Wacker Mödling' 'SønderjyskE' 'Hallescher FC' 'SKN St. Pölten'
 'MSV Duisburg' 'La Berrichonne de Châteauroux' 'FC Hansa Rostock'
 'Incheon United FC' 'Shonan Bellmare' 'Orlando Pirates' 'FK Bodø/Glimt'
 'Lincoln City' 'Brisbane Roar' 'Eintracht Braunschweig' 'Curicó Unido'
 'Falkenbergs FF' 'Venezuela' '1. FC Magdeburg' 'CD Cobresal' 'Aarhus GF'
 'Salford City' 'HJK Helsinki' 'Oxford United' 'Hobro IK' 'Esbjerg fB'
 'Bolton Wanderers' 'SV Mattersburg' 'Rotherham United' 'SCR Altach'
 'Gaz Metan Mediaş' 'Tromsø IL' 'WSG Tirol' 'IK Sirius' 'Shrewsbury'
 'Oldham Athletic' 'Southend United' 'Blackpool' 'Coventry City'
 'Adelaide United' 'Ranheim Fotball' 'SG Sonnenhof Großaspach'
 'Matsumoto Yamaga' 'Burton Albion' 'FC Hermannstadt' 'TSV 1860 München'
 'LKS Lodz' 'FC Würzburger Kickers' 'TSV Hartberg' 'Politehnica Iaşi'
 'Wellington Phoenix' 'Milton Keynes Dons' 'Stabæk Fotball'
 'Wycombe Wanderers' 'SV Waldhof Mannheim' 'AC Horsens'
 'Scunthorpe United' 'RKC Waalwijk' 'Oita Trinita' 'St. Mirren'
 'Bristol Rovers' 'Rodez Aveyron Football' 'SV Meppen' 'Viking FK'
 'Östersunds FK' 'FK Haugesund' 'Rochdale' 'Colchester United' 'Trapani'
 'Stevenage' 'Bradford City' 'Livingston FC' 'FC Chambly Oise'
 'Mansfield Town' 'SC Preußen Münster' 'FC Voluntari' 'Accrington Stanley'
 'Al Adalah' 'Lyngby BK' 'FC Carl Zeiss Jena' 'Viktoria Köln'
 'Tranmere Rovers' 'Silkeborg IF' 'Gillingham' 'Plymouth Argyle'
 'Chemnitzer FC' 'Mjøndalen IF' 'Walsall' 'Northampton Town'
 'Hamilton Academical FC' 'Grimsby Town' 'Exeter City' 'Swindon Town'
 'Raków Częstochowa' 'Chindia Târgovişte' 'US Orléans Loiret Football'
 'Forest Green Rovers' 'AFC Wimbledon' 'Carlisle United' 'Morecambe'
 'Port Vale' 'Cheltenham Town' 'Academica Clinceni' 'Crawley Town'
 'Ross County FC' 'AFC Eskilstuna' 'Macclesfield Town' 'Cork City'
 'Newport County' 'Crewe Alexandra' 'Leyton Orient' 'Cambridge United'
 "St. Patrick's Athletic" 'Bohemian FC' 'India' 'Finland' 'Waterford FC'
 'Derry City' 'Bury' 'New Zealand' 'GIF Sundsvall' 'Sligo Rovers'
 'Finn Harps' 'Seongnam FC' 'UCD AFC' 'Śląsk Wrocław']

📝📝 There are 698 different clubs.

# Players grouped by club (kept for any later cell that uses `clubs`)
clubs = data.groupby(['club'])

print("\nNations with their maximum wage\n\n",
      "\t\t", emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3)
print("")

# Highest wage per club together with the nationality of the player who
# earns it. The original `clubs[['wage_eur','nationality']].max()` took the
# maximum of each column independently, so `nationality` was simply the
# alphabetically last nationality in the club, unrelated to the wage.
# Stable descending sort + first row per club = the top earner (first
# occurrence on ties); the final sort_index restores club-name order.
max_wage = (
    data.dropna(subset=['club'])
        .sort_values('wage_eur', ascending=False, kind='mergesort')
        .drop_duplicates(subset='club')
        .set_index('club')[['wage_eur', 'nationality']]
        .sort_index()
)

# Show the first 100 clubs as a styled table
# (CSS rules: header, cells, zebra rows, hover effects)
max_wage.head(100).style.set_table_styles(
[{'selector': 'th',
  'props': [('background', '#053975'),
            ('color', 'white'),
            ('font-family', 'verdana'),
            ('font-size', '10pt')]},

 {'selector': 'td',
  'props': [('font-family', 'verdana'),
            ('padding', '0em 0em')]},

 {'selector': 'tr:nth-of-type(odd)',
  'props': [('background', '#ABB2B9'),
            ('color', 'black')]},

 {'selector': 'tr:nth-of-type(even)',
  'props': [('background', 'white'),
            ('color', 'black')]},

 {'selector': 'tr:hover',
  'props': [('background-color', 'pink')]},

 {'selector': 'th:hover',
  'props': [('font-size', '18pt')]},

 {'selector': 'tr:hover td:hover',
  'props': [('max-width', '1000px'),
            ('font-size', '18pt')]}
]
)
Nations with their maximum wage

 		 👇🏻👇🏻👇🏻

wage_eur nationality
club
SSV Jahn Regensburg 12000 Portugal
1. FC Heidenheim 1846 15000 Germany
1. FC Kaiserslautern 3000 Namibia
1. FC Köln 36000 United States
1. FC Magdeburg 4000 Ghana
1. FC Nürnberg 15000 Sweden
1. FC Union Berlin 35000 Turkey
1. FSV Mainz 05 30000 Switzerland
AC Ajaccio 4000 United States
AC Horsens 4000 United States
AD Alcorcón 8000 Venezuela
ADO Den Haag 8000 Suriname
AEK Athens 1000 Ukraine
AFC Eskilstuna 1000 Ukraine
AFC Wimbledon 3000 Wales
AIK 10000 Sweden
AJ Auxerre 6000 Serbia
AS Monaco 67000 Switzerland
AS Nancy Lorraine 4000 Senegal
AS Saint-Étienne 43000 Tunisia
AZ Alkmaar 12000 Norway
Aalborg BK 13000 Sweden
Aarhus GF 7000 Sweden
Aberdeen 6000 Wales
Abha Club 11000 Tunisia
Academica Clinceni 2000 Slovakia
Accrington Stanley 3000 Republic of Ireland
Adelaide United 2000 Norway
Ajax 39000 United States
Al Adalah 8000 Tunisia
Al Ahli 47000 Syria
Al Ain FC 1000 United Arab Emirates
Al Faisaly 17000 Trinidad & Tobago
Al Fateh 21000 Uruguay
Al Fayha 20000 Saudi Arabia
Al Hazem 10000 Saudi Arabia
Al Hilal 56000 Syria
Al Ittihad 39000 Serbia
Al Nassr 59000 Saudi Arabia
Al Raed 18000 Saudi Arabia
Al Shabab 29000 Tunisia
Al Taawoun 27000 Saudi Arabia
Al Wehda 25000 Turkey
Alanyaspor 16000 Turkey
Albacete BP 9000 Uruguay
Alianza Petrolera 2000 Guatemala
Amiens SC 20000 Sweden
América de Cali 3000 Colombia
Angers SCO 23000 Spain
Antalyaspor 16000 Turkey
Argentinos Juniors 12000 Uruguay
Arka Gdynia 3000 Spain
Arsenal 205000 Wales
Arsenal de Sarandí 8000 United States
Ascoli 4000 Tanzania
Aston Villa 61000 Zimbabwe
Astra Giurgiu 6000 Romania
Atalanta 92000 Ukraine
Athletic Club de Bilbao 36000 Spain
Atiker Konyaspor 18000 Ukraine
Atlanta United 14000 Venezuela
Atlético Bucaramanga 2000 Uruguay
Atlético Huila 1000 Venezuela
Atlético Madrid 125000 Venezuela
Atlético Mineiro 60000 Brazil
Atlético Nacional 5000 Uruguay
Atlético Paranaense 15000 Brazil
Atlético Tucumán 13000 Switzerland
Atlético de San Luis 16000 Uruguay
Audax Italiano 4000 Venezuela
Australia 0 Australia
Austria 0 Austria
Avaí FC 16000 Brazil
BK Häcken 6000 Sweden
BSC Young Boys 26000 Switzerland
Bahia 13000 Brazil
Barnsley 7000 Wales
Bayer 04 Leverkusen 89000 Poland
Bayern München II 3000 United States
Beijing Renhe FC 19000 Senegal
Beijing Sinobo Guoan FC 34000 Korea Republic
Benevento 5000 Sweden
Beşiktaş JK 80000 United States
Birmingham City 13000 Sweden
Blackburn Rovers 20000 Republic of Ireland
Blackpool 4000 Tanzania
Boavista FC 7000 Spain
Boca Juniors 31000 Venezuela
Bohemian FC 1000 Scotland
Bolivia 0 Bolivia
Bologna 36000 Uruguay
Bolton Wanderers 3000 Romania
Borussia Dortmund 170000 Switzerland
Borussia Mönchengladbach 41000 United States
Botafogo 26000 Brazil
Bournemouth 81000 Wales
Bradford City 8000 Wales
Brentford 40000 Sweden
Brescia 27000 Venezuela
Brighton & Hove Albion 53000 Spain

# Rank the clubs by the highest wage they pay and show the 50 biggest payers.
print(
    "\nClubs with their maximum wage\n\n",
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

high_wage_clubs = max_wage['wage_eur'].sort_values(ascending=False)
high_wage_clubs = high_wage_clubs.head(50)

# Render the ranking as a styled table.
high_wage_clubs.to_frame().style.set_table_styles([
    dict(selector='th',
         props=[('background', '#261655'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
Clubs with their maximum wage

 		 👇🏻👇🏻👇🏻

wage_eur
club
FC Barcelona 565000
Real Madrid 470000
Juventus 405000
Manchester City 370000
Paris Saint-Germain 290000
Manchester United 250000
Liverpool 240000
Chelsea 235000
FC Bayern München 235000
Tottenham Hotspur 220000
Arsenal 205000
Borussia Dortmund 170000
Napoli 150000
Inter 135000
Atlético Madrid 125000
West Ham United 125000
Wolverhampton Wanderers 125000
Everton 120000
Leicester City 115000
Olympique Lyonnais 110000
Lazio 105000
Fenerbahçe SK 105000
Atalanta 92000
Crystal Palace 89000
Bayer 04 Leverkusen 89000
Tigres U.A.N.L. 88000
Bournemouth 81000
Beşiktaş JK 80000
Watford 78000
Galatasaray SK 77000
RB Leipzig 77000
Roma 75000
Valencia CF 69000
Medipol Başakşehir FK 67000
AS Monaco 67000
Fiorentina 63000
Torino 63000
VfL Wolfsburg 62000
TSG 1899 Hoffenheim 61000
Aston Villa 61000
Atlético Mineiro 60000
Olympique de Marseille 60000
Al Nassr 59000
Leeds United 57000
Al Hilal 56000
Milan 56000
Brighton & Hove Albion 53000
Norwich City 53000
Southampton 53000
Celtic 52000

📝📝 The highest wage is paid by FC Barcelona, followed by Real Madrid and Juventus.

Data Preprocessing

# Keep four descriptive text columns plus every numeric feature used for clustering.
data = data[[
    # descriptive (categorical) columns
    'short_name', 'positions', 'nationality', 'club',
    # general profile
    'age', 'height_cm', 'weight_kg', 'overall', 'potential',
    'value_eur', 'wage_eur',
    'international_reputation', 'weak_foot', 'skill_moves',
    'release_clause_eur', 'contract_valid_until',
    # outfield summary ratings
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # goalkeeper summary ratings
    'gk_diving', 'gk_handling', 'gk_kicking',
    'gk_reflexes', 'gk_speed', 'gk_positioning',
    # attacking
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
    'skill_long_passing', 'skill_ball_control',
    # movement
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power
    'power_shot_power', 'power_jumping', 'power_stamina',
    'power_strength', 'power_long_shots',
    # mentality
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending
    'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle',
    # goalkeeping
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes',
]]

📝📝 We have selected 62 columns out of which 58 are numerical and 4 are categorical.

# Extracting players whose overall > 80
data = data[data['overall'] > 80]

# Preview the first 10 rows; nulls, column maxima and column minima are highlighted.
data.head(10).style.set_table_styles([
    dict(selector='th',
         props=[('background', '#C70039'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]).highlight_null(
    null_color='#CCB3C5'
).highlight_max(
    color='#0074FF', axis=0
).highlight_min(
    color='#00FFE5', axis=0
)
short_name positions nationality club age height_cm weight_kg overall potential value_eur wage_eur international_reputation weak_foot skill_moves release_clause_eur contract_valid_until pace shooting passing dribbling defending physic gk_diving gk_handling gk_kicking gk_reflexes gk_speed gk_positioning attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes
0 L. Messi Attacker Argentina FC Barcelona 32 170 72 94 94 95500000 565000 5 4 4 195800000.0 2021.0 87.0 92.0 92.0 96.0 39.0 66.0 nan nan nan nan nan nan 88 95 70 92 88 97 93 94 92 96 91 84 93 95 95 86 68 75 68 94 48 40 94 94 75 96 33 37 26 6 11 15 14 8
1 Cristiano Ronaldo Attacker Portugal Juventus 34 187 83 93 93 58500000 405000 5 4 5 96500000.0 2022.0 90.0 93.0 82.0 89.0 35.0 78.0 nan nan nan nan nan nan 84 94 89 83 87 89 81 76 77 92 89 91 87 96 71 95 95 85 78 93 63 29 95 82 85 95 28 32 24 7 11 15 14 11
2 Neymar Jr Attacker Brazil Paris Saint-Germain 27 175 68 92 92 105500000 290000 5 5 5 195200000.0 2022.0 91.0 85.0 87.0 95.0 32.0 58.0 nan nan nan nan nan nan 87 87 62 87 87 96 88 87 81 95 94 89 96 92 84 80 61 81 49 84 51 36 87 90 90 94 27 26 29 9 9 15 15 11
3 J. Oblak Goalkeeper Slovenia Atlético Madrid 26 188 87 91 93 77500000 125000 3 3 1 164700000.0 2023.0 nan nan nan nan nan nan 87.0 92.0 78.0 89.0 52.0 90.0 13 11 15 43 13 12 13 14 40 30 43 60 67 88 49 59 78 41 78 12 34 19 11 65 11 68 27 12 18 87 92 78 90 89
4 E. Hazard Attacker Belgium Real Madrid 28 175 74 91 91 90000000 470000 4 4 4 184500000.0 2024.0 91.0 83.0 86.0 94.0 35.0 66.0 nan nan nan nan nan nan 81 84 61 89 83 95 83 79 83 94 94 88 95 90 94 82 56 84 63 80 54 41 87 89 88 91 34 27 22 11 12 6 8 8
5 K. De Bruyne Midfielder Belgium Manchester City 28 181 70 91 91 90000000 370000 4 5 4 166500000.0 2023.0 76.0 86.0 92.0 86.0 61.0 78.0 nan nan nan nan nan nan 93 82 55 92 82 86 85 83 91 91 77 76 78 91 76 91 63 89 74 90 76 61 88 94 79 91 68 58 51 15 13 5 10 13
6 M. ter Stegen Goalkeeper Germany FC Barcelona 27 187 85 90 93 67500000 250000 3 4 1 143400000.0 2022.0 nan nan nan nan nan nan 88.0 85.0 88.0 90.0 45.0 88.0 18 14 11 61 14 21 18 12 63 30 38 50 37 86 43 66 79 35 78 10 43 22 11 70 25 70 25 13 10 88 85 88 88 90
7 V. van Dijk Defender Netherlands Liverpool 27 193 92 90 91 78000000 200000 3 3 2 150200000.0 2023.0 77.0 60.0 70.0 71.0 90.0 86.0 nan nan nan nan nan nan 53 52 86 78 45 70 60 70 81 76 74 79 61 88 53 81 90 75 92 64 82 89 47 65 62 89 91 92 85 13 10 13 11 11
8 L. Modrić Midfielder Croatia Real Madrid 33 172 66 90 90 45000000 340000 4 4 4 92300000.0 2020.0 74.0 76.0 89.0 89.0 72.0 66.0 nan nan nan nan nan nan 86 72 55 92 76 87 85 78 88 92 77 71 92 89 93 79 68 85 58 82 62 82 79 91 82 92 68 76 71 13 9 7 14 9
9 M. Salah Attacker Egypt Liverpool 27 175 71 90 90 80500000 240000 3 3 4 148900000.0 2023.0 93.0 86.0 81.0 89.0 45.0 74.0 nan nan nan nan nan nan 79 90 59 84 79 89 83 69 75 89 94 92 91 92 88 80 69 85 73 84 63 55 92 84 77 91 38 43 41 14 14 9 11 14

📝📝 We do not want to cluster all 18,000+ players, so we have extracted only the players whose overall score is above 80; this leaves 411 players.

# Count the missing values per column (50 worst columns first).
print(
    '\nMissing Values in each column of the data : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

missing = (
    data.isnull()
        .sum()
        .sort_values(ascending=False)
        .head(50)
        .to_frame()
)
missing.columns = ['missing_values']

# Render the counts as a styled table.
missing.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#A15F86'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
Missing Values in each column of the data : 

 		 👇🏻👇🏻👇🏻

missing_values
gk_positioning 357
gk_speed 357
gk_reflexes 357
gk_kicking 357
gk_handling 357
gk_diving 357
physic 54
defending 54
dribbling 54
passing 54
shooting 54
pace 54
release_clause_eur 14
contract_valid_until 6
attacking_crossing 0
goalkeeping_reflexes 0
goalkeeping_positioning 0
skill_moves 0
weak_foot 0
international_reputation 0
wage_eur 0
value_eur 0
potential 0
overall 0
weight_kg 0
height_cm 0
age 0
club 0
nationality 0
positions 0
attacking_finishing 0
attacking_heading_accuracy 0
attacking_short_passing 0
attacking_volleys 0
goalkeeping_kicking 0
goalkeeping_handling 0
goalkeeping_diving 0
defending_sliding_tackle 0
defending_standing_tackle 0
defending_marking 0
mentality_composure 0
mentality_penalties 0
mentality_vision 0
mentality_positioning 0
mentality_interceptions 0
mentality_aggression 0
power_long_shots 0
power_strength 0
power_stamina 0
power_jumping 0

📝📝 There are a few missing values in the extracted data, and they occur only in the numerical features.

# Replace every null with the mean of its column.
# The frame still contains text columns (short_name, positions, club,
# nationality) at this point; numeric_only=True restricts the means to the
# numeric columns so the call does not fail on pandas versions that refuse to
# average string columns. The text columns have no missing values to fill.
data = data.fillna(data.mean(numeric_only=True))

# Re-count the missing values per column to confirm the imputation worked.
print(
    '\nAfter Imputing Missing Values in the data : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

null_counts = data.isnull().sum()
missing = null_counts.sort_values(ascending=False).head(50).to_frame()
missing.columns = ['missing_values']

# Render the counts as a styled table.
missing.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#932A06'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
After Imputing Missing Values in the data : 

 		 👇🏻👇🏻👇🏻

missing_values
goalkeeping_reflexes 0
contract_valid_until 0
attacking_crossing 0
gk_positioning 0
gk_speed 0
gk_reflexes 0
gk_kicking 0
gk_handling 0
gk_diving 0
physic 0
defending 0
dribbling 0
passing 0
shooting 0
pace 0
release_clause_eur 0
goalkeeping_positioning 0
skill_moves 0
weak_foot 0
international_reputation 0
wage_eur 0
value_eur 0
potential 0
overall 0
weight_kg 0
height_cm 0
age 0
club 0
nationality 0
positions 0
attacking_finishing 0
attacking_heading_accuracy 0
attacking_short_passing 0
attacking_volleys 0
goalkeeping_kicking 0
goalkeeping_handling 0
goalkeeping_diving 0
defending_sliding_tackle 0
defending_standing_tackle 0
defending_marking 0
mentality_composure 0
mentality_penalties 0
mentality_vision 0
mentality_positioning 0
mentality_interceptions 0
mentality_aggression 0
power_long_shots 0
power_strength 0
power_stamina 0
power_jumping 0

📝📝 We have replaced the missing values with the mean of the corresponding feature, and now there are no missing values in the dataset.

# Keep the four text columns aside as plain lists; they are attached to the
# clustering results later on.
names = data['short_name'].tolist()
positions = data['positions'].tolist()
club = data['club'].tolist()
nationality = data['nationality'].tolist()

# Drop those four text columns so that only numeric features remain.
data = data.drop(columns=['short_name', 'positions', 'club', 'nationality'])

📝📝 We have saved the categorical features in lists for later use and dropped them from the dataset.

# Preview the first 10 rows of the numeric-only data; column maxima and
# minima are highlighted.
data.head(10).style.set_table_styles([
    dict(selector='th',
         props=[('background', '#68524B'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]).highlight_max(
    color='#0074FF', axis=0
).highlight_min(
    color='#00FFE5', axis=0
)
age height_cm weight_kg overall potential value_eur wage_eur international_reputation weak_foot skill_moves release_clause_eur contract_valid_until pace shooting passing dribbling defending physic gk_diving gk_handling gk_kicking gk_reflexes gk_speed gk_positioning attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes
0 32 170 72 94 94 95500000 565000 5 4 4 195800000.0 2021.0 87.0 92.0 92.0 96.0 39.0 66.0 83.6 81.1 76.6 85.3 49.7 82.4 88 95 70 92 88 97 93 94 92 96 91 84 93 95 95 86 68 75 68 94 48 40 94 94 75 96 33 37 26 6 11 15 14 8
1 34 187 83 93 93 58500000 405000 5 4 5 96500000.0 2022.0 90.0 93.0 82.0 89.0 35.0 78.0 83.6 81.1 76.6 85.3 49.7 82.4 84 94 89 83 87 89 81 76 77 92 89 91 87 96 71 95 95 85 78 93 63 29 95 82 85 95 28 32 24 7 11 15 14 11
2 27 175 68 92 92 105500000 290000 5 5 5 195200000.0 2022.0 91.0 85.0 87.0 95.0 32.0 58.0 83.6 81.1 76.6 85.3 49.7 82.4 87 87 62 87 87 96 88 87 81 95 94 89 96 92 84 80 61 81 49 84 51 36 87 90 90 94 27 26 29 9 9 15 15 11
3 26 188 87 91 93 77500000 125000 3 3 1 164700000.0 2023.0 73.8 70.3 74.8 78.5 62.5 73.1 87.0 92.0 78.0 89.0 52.0 90.0 13 11 15 43 13 12 13 14 40 30 43 60 67 88 49 59 78 41 78 12 34 19 11 65 11 68 27 12 18 87 92 78 90 89
4 28 175 74 91 91 90000000 470000 4 4 4 184500000.0 2024.0 91.0 83.0 86.0 94.0 35.0 66.0 83.6 81.1 76.6 85.3 49.7 82.4 81 84 61 89 83 95 83 79 83 94 94 88 95 90 94 82 56 84 63 80 54 41 87 89 88 91 34 27 22 11 12 6 8 8
5 28 181 70 91 91 90000000 370000 4 5 4 166500000.0 2023.0 76.0 86.0 92.0 86.0 61.0 78.0 83.6 81.1 76.6 85.3 49.7 82.4 93 82 55 92 82 86 85 83 91 91 77 76 78 91 76 91 63 89 74 90 76 61 88 94 79 91 68 58 51 15 13 5 10 13
6 27 187 85 90 93 67500000 250000 3 4 1 143400000.0 2022.0 73.8 70.3 74.8 78.5 62.5 73.1 88.0 85.0 88.0 90.0 45.0 88.0 18 14 11 61 14 21 18 12 63 30 38 50 37 86 43 66 79 35 78 10 43 22 11 70 25 70 25 13 10 88 85 88 88 90
7 27 193 92 90 91 78000000 200000 3 3 2 150200000.0 2023.0 77.0 60.0 70.0 71.0 90.0 86.0 83.6 81.1 76.6 85.3 49.7 82.4 53 52 86 78 45 70 60 70 81 76 74 79 61 88 53 81 90 75 92 64 82 89 47 65 62 89 91 92 85 13 10 13 11 11
8 33 172 66 90 90 45000000 340000 4 4 4 92300000.0 2020.0 74.0 76.0 89.0 89.0 72.0 66.0 83.6 81.1 76.6 85.3 49.7 82.4 86 72 55 92 76 87 85 78 88 92 77 71 92 89 93 79 68 85 58 82 62 82 79 91 82 92 68 76 71 13 9 7 14 9
9 27 175 71 90 90 80500000 240000 3 3 4 148900000.0 2023.0 93.0 86.0 81.0 89.0 45.0 74.0 83.6 81.1 76.6 85.3 49.7 82.4 79 90 59 84 79 89 83 69 75 89 94 92 91 92 88 80 69 85 73 84 63 55 92 84 77 91 38 43 41 14 14 9 11 14

Clustering

# standardizing the data
def std_data(dataset):
    """Standardize every column of ``dataset`` with a StandardScaler.

    Nothing is returned; the scaled values are stored as a new DataFrame
    (default integer index and column labels) on the function attribute
    ``std_data.x_standardized``.
    """
    scaled_values = preprocessing.StandardScaler().fit_transform(dataset.values)
    std_data.x_standardized = pd.DataFrame(scaled_values)

# Using PCA
def pca_data():
    """Project the standardized data onto its first two principal components.

    Reads ``std_data.x_standardized`` (so ``std_data`` must have been called
    first) and stores the 2-column result on ``pca_data.x_std``. Nothing is
    returned.
    """
    reducer = PCA(n_components=2)
    components = reducer.fit_transform(std_data.x_standardized)
    pca_data.x_std = pd.DataFrame(components)
    

📝📝 We have to standardize the data because the variables are measured on different scales. Then we will use PCA to reduce the 58 numeric dimensions to 2 so that the clusters can be plotted.

1. K-Means Clustering

# K-Means Clustering
def kmeans_clustering(dataset, n, random_state=None):
    """Run K-Means on the 2-D PCA projection and plot the clusters.

    Parameters
    ----------
    dataset : unused
        Kept for backward compatibility. The function clusters
        ``pca_data.x_std``, so ``std_data`` and ``pca_data`` must have been
        called beforehand.
    n : int
        Number of clusters.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default (None) keeps the original
        behaviour, where cluster labels and sizes may differ between runs;
        pass an integer for reproducible results.

    Side effects
    ------------
    Stores a DataFrame with columns ``x, y, cluster, name, positions, club,
    nationality`` on ``kmeans_clustering.reduced`` (player details come from
    the module-level lists ``names``, ``positions``, ``club`` and
    ``nationality``) and shows a scatter plot with the centroids drawn as '*'.
    """
    projected = pca_data.x_std

    # fit once; labels_ already holds the assignment of the fitted points,
    # so a separate predict() on the same data is unnecessary
    kmeans = KMeans(n_clusters=n, max_iter=600, random_state=random_state)
    kmeans = kmeans.fit(projected)
    centroids = kmeans.cluster_centers_
    clusters = kmeans.labels_.tolist()

    # making a new dataframe by adding players' names,positions,club,nationality and their clusters
    reduced = projected.copy()
    reduced['cluster'] = clusters
    reduced['name'] = names
    reduced['positions'] = positions
    reduced['club'] = club
    reduced['nationality'] = nationality
    reduced.columns = ['x', 'y', 'cluster', 'name', 'positions', 'club', 'nationality']
    kmeans_clustering.reduced = reduced

    # `height` is the current name of the figure-size argument
    # (the old `size` keyword is no longer accepted by recent seaborn)
    sns.lmplot(x="x", y="y", hue='cluster', data=reduced, legend=False,
               palette='hls', fit_reg=False, height=15)
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=300,
                c='white', label='centroids')

    plt.legend()
    # Set the y-limits on the axes object. Assigning to plt.ylim would replace
    # the pyplot function with a tuple and leave the limits untouched.
    plt.gca().set_ylim(-10, 10)
    plt.title("K-Means Clustering", fontsize=30)
    plt.tick_params(labelsize=15)
    plt.xlabel("PC 1", fontsize=20)
    plt.ylabel("PC 2", fontsize=20)

    plt.show()
    

# Prepare the input: standardize the features, then project them with PCA.
std_data(data)
pca_data()

# Banner above the plot.
print()
print(
    "\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
    "Plotting K-Means Clustering ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5,
)
print()

# Cluster the projected players into 5 groups and draw the scatter plot.
kmeans_clustering(data, 5)
				 📈📈📈📈📈 Plotting K-Means Clustering  📈📈📈📈📈

📝📝 The clusters largely reflect the players' positions. The graph above shows the scatter plot of the data, colored by the cluster each player belongs to. The symbol '*' marks the centroid of each cluster.

# Number of players (counted via the positions column) in each cluster
print(
    '\nNo. of Positions in each Cluster : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

cluster_players = kmeans_clustering.reduced.groupby(['cluster'])
players_count = cluster_players['positions'].count()

# Render the counts as a styled table.
players_count.to_frame().style.set_table_styles([
    dict(selector='th',
         props=[('background', '#874E8A'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
No. of Positions in each Cluster : 

 		 👇🏻👇🏻👇🏻

positions
cluster
0 116
1 54
2 78
3 52
4 111

📝📝 In the run shown above, the first cluster (label 0) has 116 players, the second 54, the third 78, the fourth 52 and the last 111. (K-Means starts from random centroids, so the labels and counts can differ between runs.)

# Group the clustered players by cluster and playing position.
cluster_positions = kmeans_clustering.reduced.groupby(['cluster', 'positions'])

# Count of every position in each cluster
print(
    '\nCount of every position in each Cluster : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

positions_count = cluster_positions['positions'].count().to_frame()

# Render the counts as a styled table.
positions_count.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#5E5373'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
Count of every position in each Cluster : 

 		 👇🏻👇🏻👇🏻

positions
cluster positions
0 Attacker 62
Midfielder 54
1 Goalkeeper 54
2 Attacker 10
Defender 31
Midfielder 37
3 Defender 51
Midfielder 1
4 Attacker 35
Defender 12
Midfielder 64

📝📝 In the run shown above, the second cluster (label 1) contains only the 54 goalkeepers, while the other 4 clusters contain different numbers of Attackers, Defenders and Midfielders.

# Nation-wise breakdown of the players within each cluster and position
print(
    '\nNationwise Positions of the Players within each Cluster : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

cluster_nations = kmeans_clustering.reduced.groupby(
    ['cluster', 'positions', 'nationality']
)
nations_count = cluster_nations['nationality'].count().to_frame()

# Render the counts as a styled table.
nations_count.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#5E83A8'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
Nationwise Positions of the Players within each Cluster : 

 		 👇🏻👇🏻👇🏻

nationality
cluster positions nationality
0 Attacker Algeria 1
Argentina 4
Belgium 2
Brazil 7
Chile 1
Croatia 1
Egypt 1
England 2
France 8
Gabon 1
Germany 4
Italy 4
Ivory Coast 2
Jamaica 1
Korea Republic 1
Mexico 2
Netherlands 3
Poland 1
Portugal 1
Senegal 1
Serbia 2
Slovenia 1
Spain 8
Switzerland 1
Uruguay 1
Venezuela 1
Midfielder Argentina 3
Belgium 3
Bosnia Herzegovina 1
Brazil 6
Colombia 1
Croatia 1
Denmark 1
England 1
France 4
Germany 7
Italy 2
Mexico 1
Morocco 1
Portugal 7
Scotland 1
Serbia 1
Spain 9
Sweden 1
Ukraine 1
Uruguay 2
1 Goalkeeper Albania 1
Argentina 1
Belgium 2
Brazil 4
Cameroon 1
Costa Rica 1
Czech Republic 2
Denmark 1
England 1
Finland 1
France 4
Germany 6
Hungary 1
Italy 7
Netherlands 1
Norway 1
Poland 2
Portugal 2
Slovenia 2
Spain 10
Switzerland 2
Uruguay 1
2 Attacker Colombia 1
Croatia 1
Denmark 1
France 2
Netherlands 3
Spain 2
Defender Argentina 1
Belgium 3
Brazil 5
England 2
France 4
Germany 2
Italy 1
Netherlands 3
Poland 1
Portugal 1
Russia 1
Serbia 1
Spain 5
Sweden 1
Midfielder Argentina 1
Belgium 2
Brazil 8
Central African Rep. 1
Chile 1
Denmark 1
England 2
France 4
Germany 2
Ghana 1
Italy 2
Mexico 1
Nigeria 1
Portugal 2
Senegal 1
Serbia 2
Spain 4
Switzerland 1
3 Defender Argentina 2
Belgium 1
Brazil 6
Cameroon 2
Colombia 1
Croatia 1
Denmark 1
England 2
France 5
Germany 6
Greece 2
Hungary 1
Italy 5
Montenegro 1
Netherlands 1
Portugal 2
Senegal 1
Slovakia 1
Spain 3
Switzerland 1
Togo 1
Uruguay 5
Midfielder Spain 1
4 Attacker Argentina 5
Austria 1
Belgium 1
Bosnia Herzegovina 1
Brazil 3
Colombia 1
DR Congo 1
Ecuador 1
England 2
France 3
Germany 1
Israel 1
Italy 4
Poland 1
Spain 5
Sweden 1
Uruguay 2
Wales 1
Defender Austria 1
Brazil 2
England 1
France 1
Portugal 3
Scotland 1
Spain 3
Midfielder Argentina 3
Armenia 1
Austria 1
Belgium 1
Bosnia Herzegovina 1
Brazil 6
Chile 1
Colombia 1
Croatia 4
England 2
France 5
Germany 6
Guinea 1
Iceland 1
Italy 3
Mexico 1
Netherlands 3
Poland 1
Portugal 4
Serbia 2
Slovakia 1
Slovenia 1
Spain 12
Uruguay 1
Wales 1

📝📝 From the table above we can see which countries' players are grouped together, broken down by cluster and position.

# NOTE: evaluating the grouped column on its own only displays the repr of the
# lazy SeriesGroupBy object; an aggregation such as .count() is needed to see values.
cluster_nations['nationality']
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f87fa8c75d0>

# Club-wise breakdown of the players within each cluster and position
print(
    '\nClubwise Positions of the Players within each Cluster : \n\n',
    "\t\t",
    emoji.emojize(":backhand_index_pointing_down_light_skin_tone:")*3,
)
print()

cluster_clubs = kmeans_clustering.reduced.groupby(
    ['cluster', 'positions', 'club']
)
clubs_count = cluster_clubs['club'].count().to_frame()

# Render the counts as a styled table.
clubs_count.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#337F93'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    # zebra striping
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    # hover effects
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
])
Clubwise Positions of the Players within each Cluster : 

 		 👇🏻👇🏻👇🏻

club
cluster positions club
0 Attacker AS Monaco 1
Ajax 3
Al Hilal 1
Arsenal 3
Atalanta 2
Atlanta United 1
Bayer 04 Leverkusen 2
Borussia Dortmund 1
Chelsea 2
Crystal Palace 1
FC Barcelona 4
FC Bayern München 2
Fenerbahçe SK 1
Grêmio 1
Juventus 2
Levante UD 1
Liverpool 3
Los Angeles FC 1
Manchester City 5
Manchester United 3
Milan 1
Napoli 2
Olympique Lyonnais 1
Olympique de Marseille 1
PSV 2
Paris Saint-Germain 3
RB Leipzig 1
RC Celta 1
Real Madrid 4
Real Sociedad 1
Sampdoria 1
Shanghai Greenland Shenhua FC 1
TSG 1899 Hoffenheim 1
Tottenham Hotspur 1
Valencia CF 1
Midfielder AS Monaco 1
Ajax 1
Al Nassr 1
Arsenal 1
Athletic Club de Bilbao 1
Atlético Madrid 2
Bayer 04 Leverkusen 1
Beşiktaş JK 2
Borussia Dortmund 6
Bournemouth 1
Chicago Fire 1
Dalian YiFang FC 1
FC Bayern München 3
FC Porto 1
Fluminense 1
Inter 1
Juventus 2
Lazio 1
Manchester City 3
Manchester United 1
Medipol Başakşehir FK 1
Napoli 1
Olympique de Marseille 1
Paris Saint-Germain 2
RB Leipzig 1
Real Betis 2
Real Madrid 3
SL Benfica 1
Shakhtar Donetsk 2
Shanghai SIPG FC 1
Sporting CP 1
Tottenham Hotspur 2
Uruguay 2
Valencia CF 1
West Ham United 1
1 Goalkeeper 1. FC Köln 1
AS Monaco 1
AS Saint-Étienne 1
Ajax 1
Arsenal 1
Atlético Madrid 2
Bayer 04 Leverkusen 1
Borussia Dortmund 1
Borussia Mönchengladbach 1
Cagliari 1
Chelsea 1
Deportivo Alavés 1
Eintracht Frankfurt 1
Everton 1
FC Barcelona 2
FC Bayern München 1
FC Porto 1
Galatasaray SK 1
Getafe CF 1
Grêmio 1
Hertha BSC 1
Inter 1
Juventus 3
Lazio 1
Leicester City 1
Liverpool 1
Manchester City 1
Manchester United 1
Milan 2
Montpellier HSC 1
Olympique Lyonnais 1
Paris Saint-Germain 1
RB Leipzig 1
Real Madrid 2
Real Valladolid CF 1
Roma 1
SV Werder Bremen 1
Sassuolo 1
Sevilla FC 1
Sporting CP 1
TSG 1899 Hoffenheim 1
Torino 1
Tottenham Hotspur 1
Valencia CF 1
VfL Wolfsburg 1
Villarreal CF 1
West Ham United 1
Wolverhampton Wanderers 1
2 Attacker Atalanta 1
Athletic Club de Bilbao 1
Atlético Madrid 1
Chelsea 1
Juventus 1
RB Leipzig 1
Sevilla FC 1
Sporting CP 1
VfL Wolfsburg 1
West Ham United 1
Defender Ajax 2
Arsenal 1
Borussia Dortmund 3
Chelsea 2
FC Barcelona 1
FC Bayern München 1
Juventus 3
Liverpool 1
Manchester City 2
Manchester United 2
PFC CSKA Moscow 1
Paris Saint-Germain 3
Real Betis 1
Real Madrid 3
Roma 1
SL Benfica 1
Tottenham Hotspur 2
VfL Wolfsburg 1
Midfielder Arsenal 1
Athletic Club de Bilbao 1
Atlético Madrid 2
Atlético Mineiro 1
Boca Juniors 1
Borussia Dortmund 2
Crystal Palace 1
FC Barcelona 2
FC Porto 1
Guangzhou Evergrande Taobao FC 1
Guangzhou R&F FC 1
Juventus 4
Lazio 2
Leicester City 1
Liverpool 3
Manchester City 2
Manchester United 1
Milan 1
Olympique de Marseille 1
Paris Saint-Germain 1
Real Betis 1
Real Madrid 1
Real Sociedad 1
Sevilla FC 1
Tottenham Hotspur 1
Valencia CF 1
Watford 1
3 Defender Arsenal 1
Atlético Madrid 3
Bayer 04 Leverkusen 1
Borussia Dortmund 1
Borussia Mönchengladbach 1
Chelsea 1
Cruzeiro 1
FC Barcelona 2
FC Bayern München 2
FC Girondins de Bordeaux 1
FC Porto 1
Getafe CF 1
Grêmio 1
Inter 3
Jiangsu Suning FC 1
Juventus 2
LOSC Lille 1
Lazio 1
Liverpool 1
Lokomotiv Moscow 2
Manchester City 2
Manchester United 1
Milan 1
Napoli 2
Paris Saint-Germain 1
RB Leipzig 1
RSC Anderlecht 1
Real Madrid 2
SL Benfica 1
Sevilla FC 1
Sporting CP 2
Torino 2
Tottenham Hotspur 1
Uruguay 2
Valencia CF 2
Villarreal CF 1
Midfielder FC Bayern München 1
4 Attacker AS Monaco 1
Al Hilal 1
Athletic Club de Bilbao 1
Atlético Madrid 1
Atlético Mineiro 1
Bayer 04 Leverkusen 1
Beijing Sinobo Guoan FC 1
Borussia Mönchengladbach 1
Brescia 1
Ecuador 1
Guangzhou R&F FC 1
Inter 3
Juventus 1
LA Galaxy 1
Lazio 1
Leicester City 1
Liverpool 1
Napoli 1
Paris Saint-Germain 1
Racing Club 1
Real Betis 1
Real Madrid 2
Real Sociedad 1
Roma 2
Shanghai SIPG FC 1
Sporting CP 1
Torino 1
Tottenham Hotspur 1
Uruguay 1
Valencia CF 1
Villarreal CF 1
Defender Everton 1
FC Barcelona 2
FC Bayern München 1
FC Porto 1
Leicester City 1
Liverpool 2
Manchester City 1
Real Madrid 1
SL Benfica 1
Valencia CF 1
Midfielder AS Monaco 1
Ajax 1
Arsenal 3
Athletic Club de Bilbao 1
Atlético Madrid 2
Bayer 04 Leverkusen 2
Beijing Sinobo Guoan FC 1
Cagliari 1
Chelsea 3
Dalian YiFang FC 1
Eintracht Frankfurt 1
Everton 1
FC Barcelona 5
FC Bayern München 5
Guangzhou Evergrande Taobao FC 1
Inter 2
Juventus 3
Lazio 1
Liverpool 2
Manchester City 1
Manchester United 2
Milan 1
Napoli 3
Olympique Lyonnais 2
Paris Saint-Germain 2
RB Leipzig 2
Real Betis 1
Real Madrid 1
SL Benfica 1
Sevilla FC 3
Tottenham Hotspur 3
Valencia CF 1
Villarreal CF 1
West Ham United 1
Wolverhampton Wanderers 2

Evaluation of K-Means Clustering

📝📝 Clustering analysis doesn't have a solid evaluation metric that we can use to evaluate the outcome of different clustering algorithms. In the cluster-predict methodology, we can evaluate how well the models perform for different numbers of clusters k, since the clusters are used in the downstream modeling. We will discuss two methods that may give us some intuition about k:
✏️ Elbow Method
✏️ Silhouette Analysis

# Standardize the data: rescale every column of `data` with StandardScaler
# so that all features are on a comparable scale before clustering.
X_std = StandardScaler().fit(data).transform(data)

i. Elbow Method

# Candidate numbers of clusters: k = 1 .. 7.
number_clusters = range(1, 8)

# One (still unfitted) K-Means estimator per candidate k.
kmeans = [KMeans(n_clusters=n, max_iter=600) for n in number_clusters]
kmeans

# Fit every estimator on the standardized data and collect its score.
# KMeans.score returns the opposite of the K-Means objective, so the values
# are negative and move towards 0 as k increases.
score = [estimator.fit(X_std).score(X_std) for estimator in kmeans]
score

# Score against k; the "elbow" is where the curve starts to flatten.
plt.plot(number_clusters, score, color='#DB4462')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.show()

📝📝 At k=3, the curve starts to flatten out, forming an elbow.

ii. Silhouette Analysis

# For every candidate k, draw one figure with two panels:
#   top    - silhouette plot (one band of bars per cluster),
#   bottom - scatter of the first two columns of X_std coloured by cluster.
for k in [2, 3, 4, 5, 6, 7]:
    fig, (ax1, ax2) = plt.subplots(2, 1)
    fig.set_size_inches(20, 10)

    # Run the Kmeans algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(X_std)
    centroids = km.cluster_centers_

    # Get silhouette samples (one coefficient per data point)
    silhouette_vals = silhouette_samples(X_std, labels)

    # Silhouette plot: stack the sorted coefficients of each cluster on top of
    # the previous cluster's band; band thickness equals the cluster size.
    y_lower, y_upper = 0, 0
    for position, cluster in enumerate(np.unique(labels), start=1):
        cluster_silhouette_vals = np.sort(silhouette_vals[labels == cluster])
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        # 1-based cluster number, written at the vertical middle of the band
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(position))
        y_lower = y_upper

    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values', fontsize=20)
    ax1.set_ylabel('Cluster labels', fontsize=20)
    ax1.set_title('Silhouette plot for the various clusters', y=1.02, fontsize=20)

    # Scatter plot of data colored with labels, centroids drawn as red stars.
    # NOTE(review): X_std is the output of StandardScaler, so columns 0 and 1
    # look like the first two standardized features rather than principal
    # components - confirm whether the 'PCA1'/'PCA2' axis labels are intended.
    ax2.scatter(X_std[:, 0], X_std[:, 1], c=labels)
    ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
    ax2.set_xlim([-15, 15])
    ax2.set_ylim([-10, 10])
    ax2.set_xlabel('PCA1', fontsize=20)
    ax2.set_ylabel('PCA2', fontsize=20)
    ax2.set_title('Visualization of clustered data', y=1.02, fontsize=20)
    ax2.set_aspect('equal')

    plt.tight_layout()
    plt.suptitle(f'Silhouette Analysis using K = {k}', fontsize=25, fontweight='semibold', y=1.05)
    

📝📝 Essence of above plot :
✏️ As the above plots show, n_clusters=2 has the best average silhouette score of around 0.5, but not all of its clusters are above the average, which shows that it is not a good choice. Also, the thickness of each band in the plot gives an indication of how big each cluster is. The plot shows that cluster 1 is much bigger than cluster 2.
✏️ However, when we increase to n_clusters=3, the average silhouette score decreases to around 0.25, but all the clusters are above the average. So it is a better choice than n_clusters=2.

Applying K-Means with a new no. of clusters

📝📝 So we will again plot k-means clustering for n_clusters=3 as discussed above.

# Tab-indented banner framed by five chart emojis, followed by one blank line.
print(
    "\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting K-Means Clustering ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

# Re-run the notebook's K-Means helper with 3 as its second argument
# (presumably the number of clusters chosen in the evaluation above).
kmeans_clustering(data, 3)
				 📈📈📈📈📈 Plotting K-Means Clustering  📈📈📈📈📈

📝📝 Essence of above plot :
✏️ We can see three pretty distinct clusters here with particularly large separation for the green cluster indicating quite a difference in terms of the position of the players. The majority of the data is contained within the blue cluster, however.
✏️ The goal of k-means is to group data points into distinct, non-overlapping subgroups. It does a very good job when the clusters are roughly spherical. However, it suffers as the geometric shapes of the clusters deviate from spherical. Moreover, it doesn’t learn the number of clusters from the data and requires it to be pre-defined.

Calculating Silhouette,Calinski Harabasz and Davies Bouldin Scores for K-means Clustering

# Fit K-Means with 3 clusters on the PCA data (fixed seed for reproducibility)
# and keep the per-sample cluster assignments.
kmeans_1 = KMeans(n_clusters=3, random_state=30)
predictions = kmeans_1.fit_predict(pca_data.x_std)

# Cluster validation metrics
score_kmeans_s = silhouette_score(pca_data.x_std, kmeans_1.labels_, metric='euclidean')
score_kmeans_c = calinski_harabasz_score(pca_data.x_std, kmeans_1.labels_)
score_kmeans_d = davies_bouldin_score(pca_data.x_std, predictions)

# One single-row frame per metric; values are pre-formatted to 4 decimals.
r1 = pd.DataFrame({'Scores': 'Silhouette Score', 'Values': f"{score_kmeans_s:.4f}"}, index=['1'])
r2 = pd.DataFrame({'Scores': 'Calinski Harabasz Score', 'Values': f"{score_kmeans_c:.4f}"}, index=['2'])
r3 = pd.DataFrame({'Scores': 'Davies Bouldin Score', 'Values': f"{score_kmeans_d:.4f}"}, index=['3'])

res1 = pd.concat([r1, r2, r3])
res1.columns = ['Scores', 'Values']

# Turn the frame into a styled table: dark header, zebra-striped rows and
# hover effects; the 'Scores' column gets its own background colour.
res1 = res1.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#34495E'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]).set_properties(subset=['Scores'], **{'background-color': '#FEF5E7'})

res1
Scores Values
1 Silhouette Score 0.5892
2 Calinski Harabasz Score 1244.3950
3 Davies Bouldin Score 0.5150

2. Hierarchical Clustering

i. Hierarchical Clustering with Average Linkage

#collapse
# Tab-indented banner framed by five chart emojis, followed by one blank line.
print(
    "\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting Hierarchical Clustering with Average Linkage ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

# Tall figure (18 x 90 inches); presumably sized so the per-leaf labels of the
# right-oriented dendrogram remain legible.
plt.figure(figsize=(18, 90))
plt.title('Hierarchical Clustering Dendrogram with Average Linkage')
# Build the average-linkage tree on the standardized data and draw it.
dendrogram = sch.dendrogram(
    sch.linkage(std_data.x_standardized, method="average"),
    labels=names,
    leaf_font_size=13,
    orientation='right',
)
				 📈📈📈📈📈 Plotting Hierarchical Clustering with Average Linkage  📈📈📈📈📈

📝📝 It grouped into three positions.

ii. Hierarchical Clustering with Single Linkage

#collapse
# Tab-indented banner framed by five chart emojis, followed by one blank line.
print(
    "\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting Hierarchical Clustering with Single Linkage ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

# Tall figure (18 x 90 inches); presumably sized so the per-leaf labels of the
# right-oriented dendrogram remain legible.
plt.figure(figsize=(18, 90))
plt.title('Hierarchical Clustering Dendrogram with Single Linkage')
# Build the single-linkage tree on the standardized data and draw it.
dendrogram = sch.dendrogram(
    sch.linkage(std_data.x_standardized, method="single"),
    labels=names,
    leaf_font_size=13,
    orientation='right',
)
				 📈📈📈📈📈 Plotting Hierarchical Clustering with Single Linkage  📈📈📈📈📈

📝📝 It grouped into three positions, and one group is almost negligible compared to the other groups.

iii. Hierarchical Clustering with Centroid Linkage

#collapse
# Tab-indented banner framed by five chart emojis, followed by one blank line.
print(
    "\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting Hierarchical Clustering with Centroid Linkage ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

# Tall figure (18 x 90 inches); presumably sized so the per-leaf labels of the
# right-oriented dendrogram remain legible.
plt.figure(figsize=(18, 90))
plt.title('Hierarchical Clustering Dendrogram with Centroid Linkage')
# Build the centroid-linkage tree on the standardized data and draw it.
dendrogram = sch.dendrogram(
    sch.linkage(std_data.x_standardized, method="centroid"),
    labels=names,
    leaf_font_size=13,
    orientation='right',
)
				 📈📈📈📈📈 Plotting Hierarchical Clustering with Centroid Linkage  📈📈📈📈📈

📝📝 It grouped into two positions.

iv. Hierarchical Clustering with Complete Linkage

#collapse
# Tab-indented banner framed by five chart emojis, followed by one blank line.
print("\t\t\t\t", emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5
      , "Plotting Hierarchical Clustering with Complete Linkage "
      , emoji.emojize(":chart_with_upwards_trend:", use_aliases=True)*5)
print("")

# Tall figure (18 x 90 inches); presumably sized so the per-leaf labels of the
# right-oriented dendrogram remain legible.
plt.figure(figsize=(18, 90))
# The title names the linkage that is actually computed below ("complete");
# it previously read "Centroid Linkage", copied from the preceding cell.
plt.title('Hierarchical Clustering Dendrogram with Complete Linkage')
# Build the complete-linkage tree on the standardized data and draw it.
dendrogram = sch.dendrogram(
    sch.linkage(std_data.x_standardized, method="complete"),
    labels=names,
    leaf_font_size=13,
    orientation='right',
)
				 📈📈📈📈📈 Plotting Hierarchical Clustering with Complete Linkage  📈📈📈📈📈

📝📝 It grouped into five positions.

📝📝 Hierarchical Clustering with Average Linkage groups the dataset better than single, Centroid and Complete Linkages

Calculating Silhouette,Calinski Harabasz and Davies Bouldin Scores for Hierarchical Clustering

# define the model
model = AgglomerativeClustering(n_clusters=3)
# Fit the model once and keep the per-sample cluster labels.
# (The estimator used to be fitted twice, once via fit() and once via
# fit_predict(); a single fit_predict() yields the same labels.)
yhat_2 = model.fit_predict(pca_data.x_std)
# retrieve unique clusters - taken from the labels, not from the estimator object
clusters = unique(yhat_2)
# Calculate cluster validation metrics
score_AGclustering_s = silhouette_score(pca_data.x_std, yhat_2, metric='euclidean')
score_AGclustering_c = calinski_harabasz_score(pca_data.x_std, yhat_2)
score_AGclustering_d = davies_bouldin_score(pca_data.x_std, yhat_2)

# One single-row frame per metric; values are pre-formatted to 4 decimals.
s1 = pd.DataFrame({'Scores': 'Silhouette Score', 'Values': "{:.4f}".format(score_AGclustering_s)}, index=['1'])
s2 = pd.DataFrame({'Scores': 'Calinski Harabasz Score', 'Values': "{:.4f}".format(score_AGclustering_c)}, index=['2'])
s3 = pd.DataFrame({'Scores': 'Davies Bouldin Score', 'Values': "{:.4f}".format(score_AGclustering_d)}, index=['3'])

res2 = pd.concat([s1, s2, s3])
res2.columns = ['Scores', 'Values']

# Turn the frame into a styled table: dark header, zebra-striped rows and
# hover effects; the 'Scores' column gets its own background colour.
res2 = res2.style.set_table_styles(
    [{'selector': 'th',
      'props': [('background', '#34495E'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]},

     {'selector': 'td',
      'props': [('font-family', 'verdana'),
                ('padding', '0em 0em')]},

     {'selector': 'tr:nth-of-type(odd)',
      'props': [('background', '#ABB2B9'),
                ('color', 'black')]},

     {'selector': 'tr:nth-of-type(even)',
      'props': [('background', 'white'),
                ('color', 'black')]},

     {'selector': 'tr:hover',
      'props': [('background-color', 'pink')]},

     {'selector': 'th:hover',
      'props': [('font-size', '18pt')]},

     {'selector': 'tr:hover td:hover',
      'props': [('max-width', '1000px'),
                ('font-size', '18pt')]}

    ]
).set_properties(**{'background-color': '#FEF5E7'}, subset=['Scores'])

res2
Scores Values
1 Silhouette Score 0.5919
2 Calinski Harabasz Score 1211.5509
3 Davies Bouldin Score 0.4953

3. DBSCAN Clustering

# performing standardization
# NOTE(review): std_data is defined in an earlier cell; other cells read
# std_data.x_standardized, so it presumably stores its result as an attribute
# on the function itself - confirm against its definition.
std_data(data)

# performing PCA
# NOTE(review): pca_data takes no arguments here and later cells read
# pca_data.x_std, so it presumably consumes the output of std_data() above and
# must run after it - confirm against its definition.
pca_data()

# DBSCAN clustering
def dbscan_clustering(e, n):
    """Cluster the PCA data with DBSCAN and draw a scatter plot of the result.

    The labelled frame (coordinates, cluster id and player details) is stored
    on the function itself as ``dbscan_clustering.reduced`` so that other
    cells can inspect it after the call.

    Parameters
    ----------
    e : float
        Passed to ``DBSCAN`` as ``eps``.
    n : int
        Passed to ``DBSCAN`` as ``min_samples``.
    """
    # train the model using DBSCAN and get the cluster label of every sample
    dbscan = DBSCAN(eps=e, min_samples=n)
    clusters = dbscan.fit_predict(pca_data.x_std)

    # making a new dataframe by adding players' names, positions and their clusters
    reduced = pca_data.x_std.copy()
    dbscan_clustering.reduced = reduced
    reduced['cluster'] = clusters
    reduced['name'] = names
    reduced['positions'] = positions
    reduced['club'] = club
    reduced['nationality'] = nationality
    reduced.columns = ['x', 'y', 'cluster', 'name', 'positions', 'club', 'nationality']

    # `height` is the current name of lmplot's former `size` argument.
    sns.lmplot(x="x", y="y", hue='cluster', data=reduced, legend=False,
               palette='hls', fit_reg=False, height=15)

    plt.legend()
    # Call ylim to set the limits; assigning to plt.ylim would replace the
    # pyplot function with a tuple and leave the axis limits untouched.
    plt.ylim(-10, 10)
    plt.title("DBSCAN Clustering", fontsize=30)
    plt.tick_params(labelsize=15)
    plt.xlabel("PC 1", fontsize=20)
    plt.ylabel("PC 2", fontsize=20)

    plt.show()
    

# DBSCAN run with eps=1 and min_samples=10.
# Output layout: blank line, tab-indented banner framed by five chart emojis,
# blank line, then the plot.
print(
    "\n\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting DBSCAN Clustering ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

dbscan_clustering(e=1, n=10)
				 📈📈📈📈📈 Plotting DBSCAN Clustering  📈📈📈📈📈

📝📝 Clusters are formed into Goalkeepers vs. the rest. Also, this plot is not very accurate.

Finding an Optimal Epsilon

# k-distance plot used to pick eps for DBSCAN.
# Ask for 2 neighbours per point: since the query points are the training
# points themselves, the first neighbour is typically the point itself
# (distance 0) and the second one is its closest other point.
nn = NearestNeighbors(n_neighbors=2)
nbrs = nn.fit(pca_data.x_std)

# distances / indices: one row per point, one column per neighbour
distances, indices = nbrs.kneighbors(pca_data.x_std)

# keep only the second column, sort it in ascending order and plot it
distances = np.sort(distances[:, 1])
plt.plot(distances, color='#098BAE')
[<matplotlib.lines.Line2D at 0x7f87fd3bd790>]

📝📝 It looks like the curvature starts picking up at around eps=0.62 and can be considered as optimal epsilon for this dataset.

Applying DBSCAN with a new Eps

# DBSCAN run with eps=0.62 and min_samples=11.
# Output layout: blank line, tab-indented banner framed by five chart emojis,
# blank line, then the plot.
print(
    "\n\t\t\t\t",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    "Plotting DBSCAN Clustering ",
    emoji.emojize(":chart_with_upwards_trend:", use_aliases=True) * 5,
    end="\n\n",
)

dbscan_clustering(e=0.62, n=11)
				 📈📈📈📈📈 Plotting DBSCAN Clustering  📈📈📈📈📈

📝📝 DBSCAN with Eps = 0.62 and min_samples = 11 is doing a better job at grouping and detecting outliers.

Calculating Silhouette,Calinski Harabasz and Davies Bouldin Scores for DBSCAN Clustering

# DBSCAN with the eps / min_samples pair selected above
model = DBSCAN(eps=0.62, min_samples=11)

# Fit and get one cluster label per sample, then list the distinct labels.
yhat = model.fit_predict(pca_data.x_std)
clusters = unique(yhat)

# Cluster validation metrics.
# NOTE(review): DBSCAN marks noise points with the label -1, and the three
# metrics below are computed on the raw labels, so noise is scored as if it
# were one more cluster - confirm this is intended.
score_dbsacn_s = silhouette_score(pca_data.x_std, yhat, metric='euclidean')
score_dbsacn_c = calinski_harabasz_score(pca_data.x_std, yhat)
score_dbsacn_d = davies_bouldin_score(pca_data.x_std, yhat)

# One single-row frame per metric; values are pre-formatted to 4 decimals.
t1 = pd.DataFrame({'Scores': 'Silhouette Score', 'Values': f"{score_dbsacn_s:.4f}"}, index=['1'])
t2 = pd.DataFrame({'Scores': 'Calinski Harabasz Score', 'Values': f"{score_dbsacn_c:.4f}"}, index=['2'])
t3 = pd.DataFrame({'Scores': 'Davies Bouldin Score', 'Values': f"{score_dbsacn_d:.4f}"}, index=['3'])

res3 = pd.concat([t1, t2, t3])
res3.columns = ['Scores', 'Values']

# Turn the frame into a styled table: dark header, zebra-striped rows and
# hover effects; the 'Scores' column gets its own background colour.
res3 = res3.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#34495E'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]).set_properties(subset=['Scores'], **{'background-color': '#FEF5E7'})

res3
Scores Values
1 Silhouette Score 0.2742
2 Calinski Harabasz Score 259.0584
3 Davies Bouldin Score 2.2460

Conclusion

# Side-by-side comparison of the three algorithms: one single-row frame per
# validation metric, every value pre-formatted to 4 decimals.
z1 = pd.DataFrame(
    {
        'Metrics': 'Silhouette Score',
        'K-Means Clustering': f"{score_kmeans_s:.4f}",
        'Hierarchical Clustering': f"{score_AGclustering_s:.4f}",
        'DBSCAN Clustering': f"{score_dbsacn_s:.4f}",
    },
    index=['1'],
)

z2 = pd.DataFrame(
    {
        'Metrics': 'Calinski Harabasz Score',
        'K-Means Clustering': f"{score_kmeans_c:.4f}",
        'Hierarchical Clustering': f"{score_AGclustering_c:.4f}",
        'DBSCAN Clustering': f"{score_dbsacn_c:.4f}",
    },
    index=['2'],
)

z3 = pd.DataFrame(
    {
        'Metrics': 'Davies Bouldin Score',
        'K-Means Clustering': f"{score_kmeans_d:.4f}",
        'Hierarchical Clustering': f"{score_AGclustering_d:.4f}",
        'DBSCAN Clustering': f"{score_dbsacn_d:.4f}",
    },
    index=['3'],
)

result = pd.concat([z1, z2, z3])
result.columns = ['Metrics', 'K-Means Clustering', 'Hierarchical Clustering', 'DBSCAN Clustering']

# Turn the frame into a styled table: dark-red header, zebra-striped rows and
# hover effects; the 'Metrics' column gets its own background colour.
result = result.style.set_table_styles([
    dict(selector='th',
         props=[('background', '#66070E'),
                ('color', 'white'),
                ('font-family', 'verdana'),
                ('font-size', '10pt')]),
    dict(selector='td',
         props=[('font-family', 'verdana'),
                ('padding', '0em 0em')]),
    dict(selector='tr:nth-of-type(odd)',
         props=[('background', '#ABB2B9'),
                ('color', 'black')]),
    dict(selector='tr:nth-of-type(even)',
         props=[('background', 'white'),
                ('color', 'black')]),
    dict(selector='tr:hover',
         props=[('background-color', 'pink')]),
    dict(selector='th:hover',
         props=[('font-size', '18pt')]),
    dict(selector='tr:hover td:hover',
         props=[('max-width', '1000px'),
                ('font-size', '18pt')]),
]).set_properties(subset=['Metrics'], **{'background-color': '#E7A4A9'})

result
Metrics K-Means Clustering Hierarchical Clustering DBSCAN Clustering
1 Silhouette Score 0.5892 0.5919 0.2742
2 Calinski Harabasz Score 1244.3950 1211.5509 259.0584
3 Davies Bouldin Score 0.5150 0.4953 2.2460

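📝📝 We can see that K-Means and Hierarchical clustering perform almost equally well, and both clearly outperform DBSCAN on every cluster validation metric. K-Means has the best Calinski-Harabasz score (higher is better), while Hierarchical clustering is marginally better on the Silhouette score (higher is better) and the Davies-Bouldin score (lower is better).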

</div>